author    Vsevolod Stakhov <vsevolod@highsecure.ru>    2018-05-23 18:14:15 +0100
committer Vsevolod Stakhov <vsevolod@highsecure.ru>    2018-05-23 18:14:15 +0100
commit    714eb56e1760fdfb26afccde92664d3a2f1e8435 (patch)
tree      84d1399acbb92f852b4bd64f9ea5412680b0c6ab /contrib/lua-torch/nn
parent    220a51ff68013dd668a45b78c60a7b8bfc10f074 (diff)
download  rspamd-714eb56e1760fdfb26afccde92664d3a2f1e8435.tar.gz
          rspamd-714eb56e1760fdfb26afccde92664d3a2f1e8435.zip
[Minor] Move lua contrib libraries to lua- prefix
Diffstat (limited to 'contrib/lua-torch/nn')
-rw-r--r-- contrib/lua-torch/nn/.gitignore | 2
-rw-r--r-- contrib/lua-torch/nn/.luacheckrc | 13
-rw-r--r-- contrib/lua-torch/nn/.travis.yml | 56
-rw-r--r-- contrib/lua-torch/nn/Abs.lua | 22
-rw-r--r-- contrib/lua-torch/nn/AbsCriterion.lua | 32
-rw-r--r-- contrib/lua-torch/nn/Add.lua | 66
-rw-r--r-- contrib/lua-torch/nn/AddConstant.lua | 50
-rw-r--r-- contrib/lua-torch/nn/BCECriterion.lua | 64
-rw-r--r-- contrib/lua-torch/nn/BatchNormalization.lua | 213
-rw-r--r-- contrib/lua-torch/nn/Bilinear.lua | 163
-rw-r--r-- contrib/lua-torch/nn/Bottle.lua | 71
-rw-r--r-- contrib/lua-torch/nn/CAdd.lua | 127
-rw-r--r-- contrib/lua-torch/nn/CAddTable.lua | 36
-rw-r--r-- contrib/lua-torch/nn/CAddTensorTable.lua | 43
-rw-r--r-- contrib/lua-torch/nn/CDivTable.lua | 26
-rw-r--r-- contrib/lua-torch/nn/CMakeLists.txt | 14
-rw-r--r-- contrib/lua-torch/nn/CMaxTable.lua | 46
-rw-r--r-- contrib/lua-torch/nn/CMinTable.lua | 46
-rw-r--r-- contrib/lua-torch/nn/CMul.lua | 166
-rw-r--r-- contrib/lua-torch/nn/CMulTable.lua | 55
-rw-r--r-- contrib/lua-torch/nn/CONTRIBUTING.md | 136
-rw-r--r-- contrib/lua-torch/nn/COPYRIGHT.txt | 36
-rw-r--r-- contrib/lua-torch/nn/CReLU.lua | 57
-rw-r--r-- contrib/lua-torch/nn/CSubTable.lua | 26
-rw-r--r-- contrib/lua-torch/nn/Clamp.lua | 5
-rw-r--r-- contrib/lua-torch/nn/ClassNLLCriterion.lua | 82
-rw-r--r-- contrib/lua-torch/nn/ClassSimplexCriterion.lua | 118
-rw-r--r-- contrib/lua-torch/nn/Collapse.lua | 30
-rw-r--r-- contrib/lua-torch/nn/Concat.lua | 158
-rw-r--r-- contrib/lua-torch/nn/ConcatTable.lua | 118
-rw-r--r-- contrib/lua-torch/nn/Constant.lua | 36
-rw-r--r-- contrib/lua-torch/nn/Container.lua | 149
-rwxr-xr-x contrib/lua-torch/nn/Contiguous.lua | 21
-rw-r--r-- contrib/lua-torch/nn/Convert.lua | 245
-rw-r--r-- contrib/lua-torch/nn/Copy.lua | 42
-rw-r--r-- contrib/lua-torch/nn/Cosine.lua | 175
-rw-r--r-- contrib/lua-torch/nn/CosineDistance.lua | 116
-rw-r--r-- contrib/lua-torch/nn/CosineEmbeddingCriterion.lua | 142
-rw-r--r-- contrib/lua-torch/nn/Criterion.lua | 64
-rw-r--r-- contrib/lua-torch/nn/CriterionTable.lua | 17
-rw-r--r-- contrib/lua-torch/nn/CrossEntropyCriterion.lua | 42
-rw-r--r-- contrib/lua-torch/nn/Decorator.lua | 47
-rw-r--r-- contrib/lua-torch/nn/DepthConcat.lua | 116
-rw-r--r-- contrib/lua-torch/nn/DistKLDivCriterion.lua | 34
-rw-r--r-- contrib/lua-torch/nn/DistanceRatioCriterion.lua | 142
-rw-r--r-- contrib/lua-torch/nn/DontCast.lua | 124
-rw-r--r-- contrib/lua-torch/nn/DotProduct.lua | 61
-rw-r--r-- contrib/lua-torch/nn/Dropout.lua | 70
-rw-r--r-- contrib/lua-torch/nn/ELU.lua | 45
-rw-r--r-- contrib/lua-torch/nn/ErrorMessages.lua | 19
-rw-r--r-- contrib/lua-torch/nn/Euclidean.lua | 197
-rw-r--r-- contrib/lua-torch/nn/Exp.lua | 9
-rw-r--r-- contrib/lua-torch/nn/FlattenTable.lua | 106
-rw-r--r-- contrib/lua-torch/nn/GPU.lua | 273
-rw-r--r-- contrib/lua-torch/nn/GatedLinearUnit.lua | 27
-rw-r--r-- contrib/lua-torch/nn/GradientReversal.lua | 32
-rw-r--r-- contrib/lua-torch/nn/HardShrink.lua | 25
-rw-r--r-- contrib/lua-torch/nn/HardTanh.lua | 37
-rw-r--r-- contrib/lua-torch/nn/HingeEmbeddingCriterion.lua | 43
-rw-r--r-- contrib/lua-torch/nn/Identity.lua | 30
-rw-r--r-- contrib/lua-torch/nn/Index.lua | 32
-rw-r--r-- contrib/lua-torch/nn/IndexLinear.lua | 398
-rw-r--r-- contrib/lua-torch/nn/Jacobian.lua | 389
-rw-r--r-- contrib/lua-torch/nn/JoinTable.lua | 74
-rw-r--r-- contrib/lua-torch/nn/Kmeans.lua | 215
-rw-r--r-- contrib/lua-torch/nn/L1Cost.lua | 30
-rw-r--r-- contrib/lua-torch/nn/L1HingeEmbeddingCriterion.lua | 41
-rw-r--r-- contrib/lua-torch/nn/L1Penalty.lua | 42
-rw-r--r-- contrib/lua-torch/nn/LayerNormalization.lua | 27
-rw-r--r-- contrib/lua-torch/nn/LeakyReLU.lua | 41
-rw-r--r-- contrib/lua-torch/nn/Linear.lua | 122
-rwxr-xr-x contrib/lua-torch/nn/LinearWeightNorm.lua | 168
-rw-r--r-- contrib/lua-torch/nn/Log.lua | 20
-rw-r--r-- contrib/lua-torch/nn/LogSigmoid.lua | 27
-rw-r--r-- contrib/lua-torch/nn/LogSoftMax.lua | 19
-rw-r--r-- contrib/lua-torch/nn/LookupTable.lua | 166
-rw-r--r-- contrib/lua-torch/nn/MM.lua | 92
-rw-r--r-- contrib/lua-torch/nn/MSECriterion.lua | 32
-rw-r--r-- contrib/lua-torch/nn/MV.lua | 82
-rw-r--r-- contrib/lua-torch/nn/MapTable.lua | 119
-rw-r--r-- contrib/lua-torch/nn/MarginCriterion.lua | 31
-rw-r--r-- contrib/lua-torch/nn/MarginRankingCriterion.lua | 75
-rw-r--r-- contrib/lua-torch/nn/MaskedSelect.lua | 71
-rw-r--r-- contrib/lua-torch/nn/Max.lua | 66
-rw-r--r-- contrib/lua-torch/nn/Maxout.lua | 13
-rw-r--r-- contrib/lua-torch/nn/Mean.lua | 14
-rw-r--r-- contrib/lua-torch/nn/Min.lua | 66
-rw-r--r-- contrib/lua-torch/nn/MixtureTable.lua | 165
-rw-r--r-- contrib/lua-torch/nn/Module.lua | 429
-rw-r--r-- contrib/lua-torch/nn/ModuleCriterion.lua | 44
-rw-r--r-- contrib/lua-torch/nn/Mul.lua | 38
-rw-r--r-- contrib/lua-torch/nn/MulConstant.lua | 41
-rw-r--r-- contrib/lua-torch/nn/MultiCriterion.lua | 40
-rw-r--r-- contrib/lua-torch/nn/MultiLabelMarginCriterion.lua | 41
-rw-r--r-- contrib/lua-torch/nn/MultiLabelSoftMarginCriterion.lua | 86
-rw-r--r-- contrib/lua-torch/nn/MultiMarginCriterion.lua | 64
-rw-r--r-- contrib/lua-torch/nn/NaN.lua | 72
-rw-r--r-- contrib/lua-torch/nn/Narrow.lua | 45
-rw-r--r-- contrib/lua-torch/nn/NarrowTable.lua | 43
-rw-r--r-- contrib/lua-torch/nn/Normalize.lua | 150
-rw-r--r-- contrib/lua-torch/nn/OneHot.lua | 69
-rw-r--r-- contrib/lua-torch/nn/PReLU.lua | 52
-rw-r--r-- contrib/lua-torch/nn/Padding.lua | 65
-rw-r--r-- contrib/lua-torch/nn/PairwiseDistance.lua | 91
-rw-r--r-- contrib/lua-torch/nn/Parallel.lua | 116
-rw-r--r-- contrib/lua-torch/nn/ParallelCriterion.lua | 41
-rw-r--r-- contrib/lua-torch/nn/ParallelTable.lua | 58
-rw-r--r-- contrib/lua-torch/nn/PartialLinear.lua | 114
-rw-r--r-- contrib/lua-torch/nn/PixelShuffle.lua | 111
-rw-r--r-- contrib/lua-torch/nn/Power.lua | 22
-rw-r--r-- contrib/lua-torch/nn/PrintSize.lua | 36
-rw-r--r-- contrib/lua-torch/nn/Profile.lua | 55
-rw-r--r-- contrib/lua-torch/nn/README.md | 21
-rw-r--r-- contrib/lua-torch/nn/RReLU.lua | 50
-rw-r--r-- contrib/lua-torch/nn/ReLU.lua | 5
-rw-r--r-- contrib/lua-torch/nn/ReLU6.lua | 32
-rw-r--r-- contrib/lua-torch/nn/Replicate.lua | 57
-rw-r--r-- contrib/lua-torch/nn/Reshape.lua | 72
-rw-r--r-- contrib/lua-torch/nn/Select.lua | 24
-rw-r--r-- contrib/lua-torch/nn/SelectTable.lua | 71
-rw-r--r-- contrib/lua-torch/nn/Sequential.lua | 122
-rw-r--r-- contrib/lua-torch/nn/Sigmoid.lua | 19
-rw-r--r-- contrib/lua-torch/nn/SmoothL1Criterion.lua | 32
-rw-r--r-- contrib/lua-torch/nn/SoftMarginCriterion.lua | 24
-rw-r--r-- contrib/lua-torch/nn/SoftMax.lua | 19
-rw-r--r-- contrib/lua-torch/nn/SoftMin.lua | 31
-rw-r--r-- contrib/lua-torch/nn/SoftPlus.lua | 35
-rw-r--r-- contrib/lua-torch/nn/SoftShrink.lua | 25
-rw-r--r-- contrib/lua-torch/nn/SoftSign.lua | 20
-rw-r--r-- contrib/lua-torch/nn/SparseJacobian.lua | 277
-rw-r--r-- contrib/lua-torch/nn/SparseLinear.lua | 242
-rw-r--r-- contrib/lua-torch/nn/SpatialAdaptiveAveragePooling.lua | 35
-rw-r--r-- contrib/lua-torch/nn/SpatialAdaptiveMaxPooling.lua | 46
-rw-r--r-- contrib/lua-torch/nn/SpatialAutoCropMSECriterion.lua | 74
-rw-r--r-- contrib/lua-torch/nn/SpatialAveragePooling.lua | 93
-rw-r--r-- contrib/lua-torch/nn/SpatialBatchNormalization.lua | 35
-rw-r--r-- contrib/lua-torch/nn/SpatialClassNLLCriterion.lua | 81
-rw-r--r-- contrib/lua-torch/nn/SpatialContrastiveNormalization.lua | 36
-rw-r--r-- contrib/lua-torch/nn/SpatialConvolution.lua | 155
-rw-r--r-- contrib/lua-torch/nn/SpatialConvolutionLocal.lua | 188
-rw-r--r-- contrib/lua-torch/nn/SpatialConvolutionMM.lua | 139
-rw-r--r-- contrib/lua-torch/nn/SpatialConvolutionMap.lua | 154
-rw-r--r-- contrib/lua-torch/nn/SpatialCrossMapLRN.lua | 153
-rw-r--r-- contrib/lua-torch/nn/SpatialDepthWiseConvolution.lua | 139
-rw-r--r-- contrib/lua-torch/nn/SpatialDilatedConvolution.lua | 80
-rw-r--r-- contrib/lua-torch/nn/SpatialDilatedMaxPooling.lua | 67
-rw-r--r-- contrib/lua-torch/nn/SpatialDivisiveNormalization.lua | 136
-rw-r--r-- contrib/lua-torch/nn/SpatialDropout.lua | 55
-rw-r--r-- contrib/lua-torch/nn/SpatialFractionalMaxPooling.lua | 165
-rw-r--r-- contrib/lua-torch/nn/SpatialFullConvolution.lua | 219
-rw-r--r-- contrib/lua-torch/nn/SpatialFullConvolutionMap.lua | 91
-rw-r--r-- contrib/lua-torch/nn/SpatialLPPooling.lua | 43
-rw-r--r-- contrib/lua-torch/nn/SpatialLogSoftMax.lua | 19
-rw-r--r-- contrib/lua-torch/nn/SpatialMaxPooling.lua | 94
-rw-r--r-- contrib/lua-torch/nn/SpatialMaxUnpooling.lua | 45
-rw-r--r-- contrib/lua-torch/nn/SpatialReflectionPadding.lua | 51
-rw-r--r-- contrib/lua-torch/nn/SpatialReplicationPadding.lua | 51
-rw-r--r-- contrib/lua-torch/nn/SpatialSoftMax.lua | 19
-rw-r--r-- contrib/lua-torch/nn/SpatialSubSampling.lua | 79
-rw-r--r-- contrib/lua-torch/nn/SpatialSubtractiveNormalization.lua | 115
-rw-r--r-- contrib/lua-torch/nn/SpatialUpSamplingBilinear.lua | 139
-rw-r--r-- contrib/lua-torch/nn/SpatialUpSamplingNearest.lua | 59
-rw-r--r-- contrib/lua-torch/nn/SpatialZeroPadding.lua | 104
-rw-r--r-- contrib/lua-torch/nn/SplitTable.lua | 43
-rw-r--r-- contrib/lua-torch/nn/Sqrt.lua | 26
-rw-r--r-- contrib/lua-torch/nn/Square.lua | 22
-rw-r--r-- contrib/lua-torch/nn/Squeeze.lua | 40
-rw-r--r-- contrib/lua-torch/nn/StochasticGradient.lua | 62
-rw-r--r-- contrib/lua-torch/nn/Sum.lua | 67
-rw-r--r-- contrib/lua-torch/nn/THNN.lua | 140
-rw-r--r-- contrib/lua-torch/nn/Tanh.lua | 19
-rw-r--r-- contrib/lua-torch/nn/TanhShrink.lua | 20
-rw-r--r-- contrib/lua-torch/nn/TemporalConvolution.lua | 73
-rw-r--r-- contrib/lua-torch/nn/TemporalDynamicKMaxPooling.lua | 65
-rw-r--r-- contrib/lua-torch/nn/TemporalMaxPooling.lua | 44
-rw-r--r-- contrib/lua-torch/nn/TemporalRowConvolution.lua | 120
-rw-r--r-- contrib/lua-torch/nn/TemporalSubSampling.lua | 64
-rw-r--r-- contrib/lua-torch/nn/Threshold.lua | 51
-rw-r--r-- contrib/lua-torch/nn/Transpose.lua | 35
-rw-r--r-- contrib/lua-torch/nn/Unsqueeze.lua | 52
-rw-r--r-- contrib/lua-torch/nn/View.lua | 96
-rw-r--r-- contrib/lua-torch/nn/VolumetricAveragePooling.lua | 54
-rw-r--r-- contrib/lua-torch/nn/VolumetricBatchNormalization.lua | 4
-rw-r--r-- contrib/lua-torch/nn/VolumetricConvolution.lua | 169
-rw-r--r-- contrib/lua-torch/nn/VolumetricDilatedConvolution.lua | 84
-rw-r--r-- contrib/lua-torch/nn/VolumetricDilatedMaxPooling.lua | 71
-rw-r--r-- contrib/lua-torch/nn/VolumetricDropout.lua | 55
-rw-r--r-- contrib/lua-torch/nn/VolumetricFractionalMaxPooling.lua | 175
-rw-r--r-- contrib/lua-torch/nn/VolumetricFullConvolution.lua | 225
-rw-r--r-- contrib/lua-torch/nn/VolumetricMaxPooling.lua | 102
-rw-r--r-- contrib/lua-torch/nn/VolumetricMaxUnpooling.lua | 56
-rw-r--r-- contrib/lua-torch/nn/VolumetricReplicationPadding.lua | 58
-rw-r--r-- contrib/lua-torch/nn/WeightNorm.lua | 208
-rw-r--r-- contrib/lua-torch/nn/WeightedEuclidean.lua | 244
-rw-r--r-- contrib/lua-torch/nn/WeightedMSECriterion.lua | 45
-rw-r--r-- contrib/lua-torch/nn/WhiteNoise.lua | 40
-rw-r--r-- contrib/lua-torch/nn/ZeroGrad.lua | 14
-rw-r--r-- contrib/lua-torch/nn/ZipTable.lua | 34
-rw-r--r-- contrib/lua-torch/nn/ZipTableOneToMany.lua | 37
-rw-r--r-- contrib/lua-torch/nn/hessian.lua | 391
-rwxr-xr-x contrib/lua-torch/nn/init.lua | 221
-rw-r--r-- contrib/lua-torch/nn/lib/CMakeLists.txt | 5
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/CMakeLists.txt | 47
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/README.md | 32
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/THNN.h | 33
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/Abs.c | 28
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/AbsCriterion.c | 40
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/BCECriterion.c | 66
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/BatchNormalization.c | 149
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/ClassNLLCriterion.c | 163
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/DistKLDivCriterion.c | 44
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/ELU.c | 54
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/FusedRNNKernel.c | 55
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/GatedLinearUnit.c | 73
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/HardShrink.c | 42
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/HardTanh.c | 133
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/IndexLinear.c | 742
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/L1Cost.c | 38
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/LeakyReLU.c | 57
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/Linear.c | 114
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/LogSigmoid.c | 36
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/LogSoftMax.c | 137
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/LookupTable.c | 225
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/MSECriterion.c | 45
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/MarginCriterion.c | 47
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/MultiLabelMarginCriterion.c | 184
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/MultiMarginCriterion.c | 168
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/PReLU.c | 207
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/RReLU.c | 132
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/Sigmoid.c | 28
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/SmoothL1Criterion.c | 49
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/SoftMarginCriterion.c | 44
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/SoftMax.c | 150
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/SoftPlus.c | 47
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/SoftShrink.c | 42
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/SparseLinear.c | 564
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/SpatialAdaptiveAveragePooling.c | 258
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/SpatialAdaptiveMaxPooling.c | 274
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/SpatialAveragePooling.c | 329
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/SpatialClassNLLCriterion.c | 131
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionLocal.c | 367
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionMM.c | 377
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionMap.c | 277
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/SpatialDepthWiseConvolution.c | 528
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/SpatialDilatedConvolution.c | 408
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/SpatialDilatedMaxPooling.c | 401
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/SpatialFractionalMaxPooling.c | 253
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/SpatialFullConvolution.c | 462
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/SpatialFullConvolutionMap.c | 222
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/SpatialMaxPooling.c | 44
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/SpatialMaxUnpooling.c | 234
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/SpatialReflectionPadding.c | 260
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/SpatialReplicationPadding.c | 260
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/SpatialSubSampling.c | 302
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/SpatialUpSamplingBilinear.c | 174
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/SpatialUpSamplingNearest.c | 199
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/Sqrt.c | 52
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/Square.c | 59
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/THNN.h | 1501
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/Tanh.c | 49
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/TemporalConvolution.c | 398
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/TemporalMaxPooling.c | 283
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/TemporalRowConvolution.c | 472
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/TemporalSubSampling.c | 156
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/Threshold.c | 64
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/VolumetricAveragePooling.c | 373
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/VolumetricConvolution.c | 260
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/VolumetricConvolutionMM.c | 628
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/VolumetricDilatedConvolution.c | 420
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/VolumetricDilatedMaxPooling.c | 515
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/VolumetricFractionalMaxPooling.c | 279
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/VolumetricFullConvolution.c | 541
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/VolumetricMaxPooling.c | 50
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/VolumetricMaxUnpooling.c | 373
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/VolumetricReplicationPadding.c | 357
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/VolumetricUpSamplingNearest.c | 226
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/VolumetricUpSamplingTrilinear.c | 213
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/generic/unfold.c | 166
-rw-r--r-- contrib/lua-torch/nn/lib/THNN/init.c | 280
-rw-r--r-- contrib/lua-torch/nn/mkdocs.yml | 18
-rwxr-xr-x contrib/lua-torch/nn/test.lua | 8787
-rw-r--r-- contrib/lua-torch/nn/utils.lua | 223
282 files changed, 43109 insertions, 0 deletions
diff --git a/contrib/lua-torch/nn/.gitignore b/contrib/lua-torch/nn/.gitignore
new file mode 100644
index 000000000..e0fa91eda
--- /dev/null
+++ b/contrib/lua-torch/nn/.gitignore
@@ -0,0 +1,2 @@
+build/
+THNN_h.lua
diff --git a/contrib/lua-torch/nn/.luacheckrc b/contrib/lua-torch/nn/.luacheckrc
new file mode 100644
index 000000000..3d358e9c0
--- /dev/null
+++ b/contrib/lua-torch/nn/.luacheckrc
@@ -0,0 +1,13 @@
+-- -*- mode: lua; -*-
+std = "luajit"
+
+globals = {
+ "torch",
+ "nn",
+ "include",
+}
+
+unused_args = false
+
+
+files['test.lua'].redefined = false
diff --git a/contrib/lua-torch/nn/.travis.yml b/contrib/lua-torch/nn/.travis.yml
new file mode 100644
index 000000000..1d10e0fb5
--- /dev/null
+++ b/contrib/lua-torch/nn/.travis.yml
@@ -0,0 +1,56 @@
+language: c
+compiler:
+ - gcc
+ - clang
+cache:
+ directories:
+ - $HOME/OpenBlasInstall
+sudo: false
+env:
+ - TORCH_LUA_VERSION=LUAJIT21
+ - TORCH_LUA_VERSION=LUA51
+ - TORCH_LUA_VERSION=LUA52
+addons:
+ apt:
+ packages:
+ - cmake
+ - gfortran
+ - gcc-multilib
+ - gfortran-multilib
+ - liblapack-dev
+ - build-essential
+ - gcc
+ - g++
+ - curl
+ - cmake
+ - libreadline-dev
+ - git-core
+ - libqt4-core
+ - libqt4-gui
+ - libqt4-dev
+ - libjpeg-dev
+ - libpng-dev
+ - ncurses-dev
+ - imagemagick
+ - libzmq3-dev
+ - gfortran
+ - unzip
+ - gnuplot
+ - gnuplot-x11
+before_script:
+- export ROOT_TRAVIS_DIR=$(pwd)
+- export INSTALL_PREFIX=~/torch/install
+- ls $HOME/OpenBlasInstall/lib || (cd /tmp/ && git clone https://github.com/xianyi/OpenBLAS.git -b master && cd OpenBLAS && (make NO_AFFINITY=1 -j$(getconf _NPROCESSORS_ONLN) 2>/dev/null >/dev/null) && make PREFIX=$HOME/OpenBlasInstall install)
+- git clone https://github.com/torch/distro.git ~/torch --recursive
+- cd ~/torch && git submodule update --init --recursive
+- mkdir build && cd build
+- export CMAKE_LIBRARY_PATH=$HOME/OpenBlasInstall/include:$HOME/OpenBlasInstall/lib:$CMAKE_LIBRARY_PATH
+- cmake .. -DCMAKE_INSTALL_PREFIX="${INSTALL_PREFIX}" -DCMAKE_BUILD_TYPE=Release -DWITH_${TORCH_LUA_VERSION}=ON
+- make && make install
+- cd $ROOT_TRAVIS_DIR
+- export LD_LIBRARY_PATH=${INSTALL_PREFIX}/lib:$LD_LIBRARY_PATH
+script:
+- ${INSTALL_PREFIX}/bin/luarocks make rocks/nn-scm-1.rockspec
+- export PATH=${INSTALL_PREFIX}/bin:$PATH
+- export TESTLUA=$(which luajit lua | head -n 1)
+- ${TESTLUA} -lnn -e "t=nn.test(); if t.errors[1] then os.exit(1) end"
diff --git a/contrib/lua-torch/nn/Abs.lua b/contrib/lua-torch/nn/Abs.lua
new file mode 100644
index 000000000..b32b64f79
--- /dev/null
+++ b/contrib/lua-torch/nn/Abs.lua
@@ -0,0 +1,22 @@
+local Abs, parent = torch.class('nn.Abs', 'nn.Module')
+
+function Abs:__init()
+ parent.__init(self)
+end
+
+function Abs:updateOutput(input)
+ input.THNN.Abs_updateOutput(
+ input:cdata(),
+ self.output:cdata()
+ )
+ return self.output
+end
+
+function Abs:updateGradInput(input, gradOutput)
+ input.THNN.Abs_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata()
+ )
+ return self.gradInput
+end
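
A minimal usage sketch for nn.Abs (illustrative, not part of this commit; assumes torch and nn are installed). The module is stateless, so forward and backward can be called directly:

require 'nn'

local m = nn.Abs()
local x = torch.Tensor{-1, 2, -3}
local y = m:forward(x)                    -- y = [1, 2, 3]
-- the gradient of |x| is sign(x), so gx = [-1, 1, -1]
local gx = m:backward(x, torch.ones(3))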
diff --git a/contrib/lua-torch/nn/AbsCriterion.lua b/contrib/lua-torch/nn/AbsCriterion.lua
new file mode 100644
index 000000000..65e2f8ae1
--- /dev/null
+++ b/contrib/lua-torch/nn/AbsCriterion.lua
@@ -0,0 +1,32 @@
+local AbsCriterion, parent = torch.class('nn.AbsCriterion', 'nn.Criterion')
+
+function AbsCriterion:__init(sizeAverage)
+ parent.__init(self)
+ if sizeAverage ~= nil then
+ self.sizeAverage = sizeAverage
+ else
+ self.sizeAverage = true
+ end
+end
+
+function AbsCriterion:updateOutput(input, target)
+ self.output_tensor = self.output_tensor or input.new(1)
+ input.THNN.AbsCriterion_updateOutput(
+ input:cdata(),
+ target:cdata(),
+ self.output_tensor:cdata(),
+ self.sizeAverage
+ )
+ self.output = self.output_tensor[1]
+ return self.output
+end
+
+function AbsCriterion:updateGradInput(input, target)
+ input.THNN.AbsCriterion_updateGradInput(
+ input:cdata(),
+ target:cdata(),
+ self.gradInput:cdata(),
+ self.sizeAverage
+ )
+ return self.gradInput
+end
diff --git a/contrib/lua-torch/nn/Add.lua b/contrib/lua-torch/nn/Add.lua
new file mode 100644
index 000000000..d071a15b3
--- /dev/null
+++ b/contrib/lua-torch/nn/Add.lua
@@ -0,0 +1,66 @@
+local Add, parent = torch.class('nn.Add', 'nn.Module')
+
+function Add:__init(inputSize,scalar)
+ parent.__init(self)
+
+ local size = inputSize
+ if scalar then size=1 end
+ self.scalar = scalar
+ self.bias = torch.Tensor(size)
+ self.gradBias = torch.Tensor(size)
+
+ self._ones = torch.Tensor{1}
+
+ self:reset()
+end
+
+function Add:reset(stdv)
+ if stdv then
+ stdv = stdv * math.sqrt(3)
+ else
+ stdv = 1./math.sqrt(self.bias:size(1))
+ end
+
+ self.bias:uniform(-stdv, stdv)
+end
+
+function Add:updateOutput(input)
+ self.output:resizeAs(input):copy(input)
+ if self.scalar then
+ self.output:add(self.bias[1]);
+ else
+ if input:isSameSizeAs(self.bias) then
+ self.output:add(self.bias)
+ else
+ local batchSize = input:size(1)
+ if self._ones:size(1) ~= batchSize then
+ self._ones:resize(batchSize):fill(1)
+ end
+ local bias = self.bias:view(-1)
+ local output = self.output:view(batchSize, -1)
+ output:addr(1, self._ones, bias)
+ end
+ end
+ return self.output
+end
+
+function Add:updateGradInput(input, gradOutput)
+ if self.gradInput then
+ self.gradInput:resizeAs(gradOutput):copy(gradOutput)
+ return self.gradInput
+ end
+end
+
+function Add:accGradParameters(input, gradOutput, scale)
+ scale = scale or 1
+ if self.gradBias:size(1) == 1 then
+ self.gradBias[1] = self.gradBias[1] + scale*gradOutput:sum();
+ else
+ if input:isSameSizeAs(self.bias) then
+ self.gradBias:add(scale, gradOutput)
+ else
+ local gradOutput = gradOutput:view(input:size(1), -1)
+ self.gradBias:view(-1):addmv(scale, gradOutput:t(), self._ones)
+ end
+ end
+end
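
An illustrative sketch of nn.Add in its two modes (assumed usage, not part of this commit): a learned per-element bias vector, or a single shared scalar bias:

require 'nn'

local add = nn.Add(5)                     -- bias is a learned vector of 5 elements
local y1 = add:forward(torch.zeros(5))    -- output equals add.bias

local addScalar = nn.Add(5, true)         -- scalar mode: one bias shared by all elements
local y2 = addScalar:forward(torch.zeros(2, 5))  -- every entry equals addScalar.bias[1]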
diff --git a/contrib/lua-torch/nn/AddConstant.lua b/contrib/lua-torch/nn/AddConstant.lua
new file mode 100644
index 000000000..b686d719c
--- /dev/null
+++ b/contrib/lua-torch/nn/AddConstant.lua
@@ -0,0 +1,50 @@
+local AddConstant, parent = torch.class('nn.AddConstant', 'nn.Module')
+
+function AddConstant:__init(constant_scalar,ip)
+ parent.__init(self)
+ self.constant_scalar = constant_scalar
+
+ -- default for inplace is false
+ self.inplace = ip or false
+ if (ip and type(ip) ~= 'boolean') then
+ error('in-place flag must be boolean')
+ end
+end
+
+function AddConstant:updateOutput(input)
+ assert(type(self.constant_scalar) == 'number' or
+ (torch.isTensor(self.constant_scalar) and input:nDimension() <= 2 and
+ input:size(input:nDimension()) == self.constant_scalar:size(1)),
+ 'input is not scalar or doesn\'t match the dimension of the constant!')
+ local tmp
+ if torch.isTensor(self.constant_scalar) and input:nDimension() == 2 then
+ local nOutput = self.constant_scalar:size(1)
+ tmp = self.constant_scalar.new()
+ tmp:resize(1,nOutput)
+ tmp:copy(self.constant_scalar)
+ tmp = tmp:expand(input:size(1),nOutput)
+ else
+ tmp = self.constant_scalar
+ end
+ if self.inplace then
+ input:add(tmp)
+ self.output:set(input)
+ else
+ self.output:resizeAs(input)
+ self.output:copy(input)
+ self.output:add(tmp)
+ end
+ return self.output
+end
+
+function AddConstant:updateGradInput(input, gradOutput)
+ if self.inplace then
+ self.gradInput:set(gradOutput)
+ -- restore previous input value
+ input:add(-self.constant_scalar)
+ else
+ self.gradInput:resizeAs(gradOutput)
+ self.gradInput:copy(gradOutput)
+ end
+ return self.gradInput
+end
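
A usage sketch for nn.AddConstant (illustrative): the second constructor argument requests in-place operation, which avoids an extra allocation but overwrites the input tensor, as the updateGradInput above has to undo:

require 'nn'

local m = nn.AddConstant(2)               -- out-of-place: input left untouched
local x = torch.Tensor{1, 2, 3}
local y = m:forward(x)                    -- y = [3, 4, 5], x unchanged

local mi = nn.AddConstant(2, true)        -- in-place: the output is the shifted input itself
local z = mi:forward(x)                   -- x now holds [3, 4, 5]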
diff --git a/contrib/lua-torch/nn/BCECriterion.lua b/contrib/lua-torch/nn/BCECriterion.lua
new file mode 100644
index 000000000..8bb5f8178
--- /dev/null
+++ b/contrib/lua-torch/nn/BCECriterion.lua
@@ -0,0 +1,64 @@
+local THNN = require 'nn.THNN'
+local BCECriterion, parent = torch.class('nn.BCECriterion', 'nn.Criterion')
+
+function BCECriterion:__init(weights, sizeAverage)
+ parent.__init(self)
+ if sizeAverage ~= nil then
+ self.sizeAverage = sizeAverage
+ else
+ self.sizeAverage = true
+ end
+ if weights ~= nil then
+ assert(weights:dim() == 1, "weights input should be 1-D Tensor")
+ self.weights = weights
+ end
+end
+
+
+function BCECriterion:__len()
+ return self.weights and #self.weights or 0
+end
+
+function BCECriterion:updateOutput(input, target)
+ -- - log(input) * target - log(1 - input) * (1 - target)
+ assert( input:nElement() == target:nElement(),
+ "input and target size mismatch")
+ self.output_tensor = self.output_tensor or input.new(1)
+
+ local weights = self.weights
+ if weights ~= nil and target:dim() ~= 1 then
+ weights = self.weights:view(1, target:size(2)):expandAs(target)
+ end
+
+ input.THNN.BCECriterion_updateOutput(
+ input:cdata(),
+ target:cdata(),
+ self.output_tensor:cdata(),
+ self.sizeAverage,
+ THNN.optionalTensor(weights)
+ )
+
+ self.output = self.output_tensor[1]
+ return self.output
+end
+
+function BCECriterion:updateGradInput(input, target)
+ -- - (target - input) / ( input (1 - input) )
+ assert( input:nElement() == target:nElement(),
+ "input and target size mismatch")
+
+ local weights = self.weights
+ if weights ~= nil and target:dim() ~= 1 then
+ weights = self.weights:view(1, target:size(2)):expandAs(target)
+ end
+
+ input.THNN.BCECriterion_updateGradInput(
+ input:cdata(),
+ target:cdata(),
+ self.gradInput:cdata(),
+ self.sizeAverage,
+ THNN.optionalTensor(weights)
+ )
+
+ return self.gradInput
+end
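
An illustrative sketch of nn.BCECriterion (assumed usage, not part of this commit): inputs must be probabilities in (0, 1), typically produced by nn.Sigmoid, and targets are 0/1 labels:

require 'nn'

local crit = nn.BCECriterion()
local p = torch.Tensor{0.9, 0.2, 0.7}     -- predicted probabilities
local t = torch.Tensor{1, 0, 1}           -- binary targets
-- loss = mean of -log(p) on positives and -log(1-p) on negatives
local loss = crit:forward(p, t)
local grad = crit:backward(p, t)          -- dloss/dp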
diff --git a/contrib/lua-torch/nn/BatchNormalization.lua b/contrib/lua-torch/nn/BatchNormalization.lua
new file mode 100644
index 000000000..8dfc576b3
--- /dev/null
+++ b/contrib/lua-torch/nn/BatchNormalization.lua
@@ -0,0 +1,213 @@
+--[[
+ This file implements Batch Normalization as described in the paper:
+ "Batch Normalization: Accelerating Deep Network Training
+ by Reducing Internal Covariate Shift"
+ by Sergey Ioffe, Christian Szegedy
+
+ This implementation is useful for inputs NOT coming from convolution layers.
+ For convolution layers, use nn.SpatialBatchNormalization.
+
+ The operation implemented is:
+ y = ( ( x - mean(x) ) / standard-deviation(x) ) * gamma + beta
+ where gamma and beta are learnable parameters.
+
+ The learning of gamma and beta is optional.
+
+ Usage:
+ with learnable parameters: nn.BatchNormalization(N [,eps] [,momentum])
+ where N = dimensionality of input
+ without learnable parameters: nn.BatchNormalization(N [,eps] [,momentum], false)
+
+ eps is a small value added to the standard-deviation to avoid divide-by-zero.
+ Defaults to 1e-5
+
+ During training, this layer keeps a running estimate of its computed mean and std.
+ The running estimate is kept with a default momentum of 0.1 (unless overridden).
+ At test time, this running mean/std is used to normalize.
+]]--
+local BN,parent = torch.class('nn.BatchNormalization', 'nn.Module')
+local THNN = require 'nn.THNN'
+
+BN.__version = 2
+
+-- expected dimension of input
+BN.nDim = 2
+
+function BN:__init(nOutput, eps, momentum, affine)
+ parent.__init(self)
+ assert(nOutput and type(nOutput) == 'number',
+ 'Missing argument #1: dimensionality of input. ')
+ assert(nOutput ~= 0, 'To set affine=false call BatchNormalization'
+ .. '(nOutput, eps, momentum, false) ')
+ if affine ~= nil then
+ assert(type(affine) == 'boolean', 'affine has to be true/false')
+ self.affine = affine
+ else
+ self.affine = true
+ end
+ self.eps = eps or 1e-5
+ self.train = true
+ self.momentum = momentum or 0.1
+ self.running_mean = torch.zeros(nOutput)
+ self.running_var = torch.ones(nOutput)
+
+ if self.affine then
+ self.weight = torch.Tensor(nOutput)
+ self.bias = torch.Tensor(nOutput)
+ self.gradWeight = torch.Tensor(nOutput)
+ self.gradBias = torch.Tensor(nOutput)
+ self:reset()
+ end
+end
+
+function BN:reset()
+ if self.weight then
+ self.weight:uniform()
+ end
+ if self.bias then
+ self.bias:zero()
+ end
+ self.running_mean:zero()
+ self.running_var:fill(1)
+end
+
+function BN:checkInputDim(input)
+ local iDim = input:dim()
+ assert(iDim == self.nDim or
+ (iDim == self.nDim - 1 and self.train == false), string.format(
+ 'only mini-batch supported (%dD tensor), got %dD tensor instead',
+ self.nDim, iDim))
+ local featDim = (iDim == self.nDim - 1) and 1 or 2
+ assert(input:size(featDim) == self.running_mean:nElement(), string.format(
+ 'got %d-feature tensor, expected %d',
+ input:size(featDim), self.running_mean:nElement()))
+end
+
+local function makeContiguous(self, input, gradOutput)
+ if not input:isContiguous() then
+ self._input = self._input or input.new()
+ self._input:resizeAs(input):copy(input)
+ input = self._input
+ end
+ if gradOutput then
+ if not gradOutput:isContiguous() then
+ self._gradOutput = self._gradOutput or gradOutput.new()
+ self._gradOutput:resizeAs(gradOutput):copy(gradOutput)
+ gradOutput = self._gradOutput
+ end
+ end
+ return input, gradOutput
+end
+
+local function makeBatch(self, input)
+ local iDim = input:dim()
+ if self.train == false and iDim == self.nDim - 1 then
+ return nn.utils.addSingletonDimension(input, input, 1)
+ else
+ return input
+ end
+end
+
+function BN:updateOutput(input)
+ self:checkInputDim(input)
+
+ input = makeContiguous(self, input)
+ input = makeBatch(self, input)
+
+ self.save_mean = self.save_mean or input.new()
+ self.save_mean:resizeAs(self.running_mean)
+ self.save_std = self.save_std or input.new()
+ self.save_std:resizeAs(self.running_var)
+
+ input.THNN.BatchNormalization_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ THNN.optionalTensor(self.weight),
+ THNN.optionalTensor(self.bias),
+ self.running_mean:cdata(),
+ self.running_var:cdata(),
+ self.save_mean:cdata(),
+ self.save_std:cdata(),
+ self.train,
+ self.momentum,
+ self.eps)
+
+ return self.output
+end
+
+local function backward(self, input, gradOutput, scale, gradInput, gradWeight, gradBias)
+ self:checkInputDim(input)
+ self:checkInputDim(gradOutput)
+ assert(self.save_mean and self.save_std, 'must call :updateOutput() first')
+
+ input, gradOutput = makeContiguous(self, input, gradOutput)
+ input = makeBatch(self, input)
+ gradOutput = makeBatch(self, gradOutput)
+
+ scale = scale or 1
+ if gradInput then
+ gradInput:resizeAs(gradOutput)
+ end
+
+ input.THNN.BatchNormalization_backward(
+ input:cdata(),
+ gradOutput:cdata(),
+ THNN.optionalTensor(gradInput),
+ THNN.optionalTensor(gradWeight),
+ THNN.optionalTensor(gradBias),
+ THNN.optionalTensor(self.weight),
+ self.running_mean:cdata(),
+ self.running_var:cdata(),
+ self.save_mean:cdata(),
+ self.save_std:cdata(),
+ self.train,
+ scale,
+ self.eps)
+
+ return self.gradInput
+end
+
+function BN:backward(input, gradOutput, scale)
+ return backward(self, input, gradOutput, scale, self.gradInput, self.gradWeight, self.gradBias)
+end
+
+function BN:updateGradInput(input, gradOutput)
+ return backward(self, input, gradOutput, 1, self.gradInput)
+end
+
+function BN:accGradParameters(input, gradOutput, scale)
+ return backward(self, input, gradOutput, scale, nil, self.gradWeight, self.gradBias)
+end
+
+function BN:read(file, version)
+ parent.read(self, file)
+ if version < 2 then
+ if self.running_std then
+ self.running_var = self.running_std:pow(-2):add(-self.eps)
+ self.running_std = nil
+ end
+ end
+end
+
+function BN:clearState()
+ -- first 5 buffers are not present in the current implementation,
+ -- but we keep them for cleaning old saved models
+ nn.utils.clear(self, {
+ 'buffer',
+ 'buffer2',
+ 'centered',
+ 'std',
+ 'normalized',
+ '_input',
+ '_gradOutput',
+ 'save_mean',
+ 'save_std',
+ })
+ return parent.clearState(self)
+end
+
+function BN:__tostring__()
+ return string.format('%s (%dD) (%d)', torch.type(self), self.nDim, self.running_mean:nElement())
+end
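
A usage sketch (illustrative, not from the commit): nn.BatchNormalization normalizes with batch statistics during training and with the stored running estimates after :evaluate():

require 'nn'

local bn = nn.BatchNormalization(10)      -- 10 input features, affine by default
local x = torch.randn(32, 10)             -- mini-batch of 32

bn:training()
local yTrain = bn:forward(x)              -- batch statistics; updates running_mean/running_var

bn:evaluate()
local yTest = bn:forward(x)               -- uses the stored running estimates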
diff --git a/contrib/lua-torch/nn/Bilinear.lua b/contrib/lua-torch/nn/Bilinear.lua
new file mode 100644
index 000000000..9350b03ec
--- /dev/null
+++ b/contrib/lua-torch/nn/Bilinear.lua
@@ -0,0 +1,163 @@
+local Bilinear, parent = torch.class('nn.Bilinear', 'nn.Module')
+
+local function isint(x) return type(x) == 'number' and x == math.floor(x) end
+function Bilinear:__assertInput(input)
+ assert(input and type(input) == 'table' and #input == 2,
+ 'input should be a table containing two data Tensors')
+ assert(input[1]:nDimension() == 2 and input[2]:nDimension() == 2,
+ 'input Tensors should be two-dimensional')
+ assert(input[1]:size(1) == input[2]:size(1),
+ 'input Tensors should have the same number of rows (instances)')
+ assert(input[1]:size(2) == self.weight:size(2),
+ 'dimensionality of first input is erroneous')
+ assert(input[2]:size(2) == self.weight:size(3),
+ 'dimensionality of second input is erroneous')
+end
+function Bilinear:__assertInputGradOutput(input, gradOutput)
+ assert(input[1]:size(1) == gradOutput:size(1),
+ 'number of rows in gradOutput does not match input')
+ assert(gradOutput:size(2) == self.weight:size(1),
+ 'number of columns in gradOutput does not match output size of layer')
+end
+
+function Bilinear:__init(inputSize1, inputSize2, outputSize, bias)
+
+ -- assertions:
+ assert(self and inputSize1 and inputSize2 and outputSize,
+ 'should specify inputSize1 and inputSize2 and outputSize')
+ assert(isint(inputSize1) and isint(inputSize2) and isint(outputSize),
+ 'inputSize1 and inputSize2 and outputSize should be integer numbers')
+ assert(inputSize1 > 0 and inputSize2 > 0 and outputSize > 0,
+ 'inputSize1 and inputSize2 and outputSize should be positive numbers')
+
+ -- set up model:
+ parent.__init(self)
+ local bias = ((bias == nil) and true) or bias
+ self.weight = torch.Tensor(outputSize, inputSize1, inputSize2)
+ self.gradWeight = torch.Tensor(outputSize, inputSize1, inputSize2)
+ if bias then
+ self.bias = torch.Tensor(outputSize)
+ self.gradBias = torch.Tensor(outputSize)
+ end
+ self.gradInput = {torch.Tensor(), torch.Tensor()}
+ self:reset()
+end
+
+function Bilinear:reset(stdv)
+ assert(self)
+ if stdv then
+ assert(stdv and type(stdv) == 'number' and stdv > 0,
+ 'standard deviation should be a positive number')
+ stdv = stdv * math.sqrt(3)
+ else
+ stdv = 1 / math.sqrt(self.weight:size(2))
+ end
+ self.weight:uniform(-stdv, stdv)
+ if self.bias then self.bias:uniform(-stdv, stdv) end
+ return self
+end
+
+function Bilinear:updateOutput(input)
+ assert(self)
+ self:__assertInput(input)
+
+ -- set up buffer:
+ self.buff2 = self.buff2 or input[1].new()
+ self.buff2:resizeAs(input[2])
+
+ -- compute output scores:
+ self.output:resize(input[1]:size(1), self.weight:size(1))
+ for k = 1,self.weight:size(1) do
+ torch.mm(self.buff2, input[1], self.weight[k])
+ self.buff2:cmul(input[2])
+ torch.sum(self.output:narrow(2, k, 1), self.buff2, 2)
+ end
+ if self.bias then
+ self.output:add(
+ self.bias:reshape(1, self.bias:nElement()):expandAs(self.output)
+ )
+ end
+ return self.output
+end
+
+function Bilinear:updateGradInput(input, gradOutput)
+ assert(self)
+ if self.gradInput then
+ self:__assertInputGradOutput(input, gradOutput)
+
+ if #self.gradInput == 0 then
+ for i = 1, 2 do self.gradInput[i] = input[1].new() end
+ end
+
+ -- compute d output / d input:
+ self.gradInput[1]:resizeAs(input[1]):fill(0)
+ self.gradInput[2]:resizeAs(input[2]):fill(0)
+
+
+ -- do first slice of weight tensor (k = 1)
+ self.gradInput[1]:mm(input[2], self.weight[1]:t())
+ self.gradInput[1]:cmul(gradOutput:narrow(2,1,1):expand(self.gradInput[1]:size(1),
+ self.gradInput[1]:size(2)))
+ self.gradInput[2]:addmm(1, input[1], self.weight[1])
+ self.gradInput[2]:cmul(gradOutput:narrow(2,1,1):expand(self.gradInput[2]:size(1),
+ self.gradInput[2]:size(2)))
+
+ -- do remaining slices of weight tensor
+ if self.weight:size(1) > 1 then
+ self.buff1 = self.buff1 or input[1].new()
+ self.buff1:resizeAs(input[1])
+
+ for k = 2, self.weight:size(1) do
+ self.buff1:mm(input[2], self.weight[k]:t())
+ self.buff1:cmul(gradOutput:narrow(2,k,1):expand(self.gradInput[1]:size(1),
+ self.gradInput[1]:size(2)))
+ self.gradInput[1]:add(self.buff1)
+
+ self.buff2:mm(input[1], self.weight[k])
+ self.buff2:cmul(gradOutput:narrow(2,k,1):expand(self.gradInput[2]:size(1),
+ self.gradInput[2]:size(2)))
+ self.gradInput[2]:add(self.buff2)
+ end
+ end
+ return self.gradInput
+ end
+end
+
+function Bilinear:accGradParameters(input, gradOutput, scale)
+ local scale = scale or 1
+ self:__assertInputGradOutput(input, gradOutput)
+ assert(scale and type(scale) == 'number' and scale >= 0)
+
+ -- make sure we have buffer:
+ self.buff1 = self.buff1 or input[1].new()
+ self.buff1:resizeAs(input[1])
+
+ -- accumulate parameter gradients:
+ for k = 1,self.weight:size(1) do
+ torch.cmul(
+ self.buff1, input[1], gradOutput:narrow(2, k, 1):expandAs(input[1])
+ )
+ self.gradWeight[k]:addmm(self.buff1:t(), input[2])
+ end
+ if self.bias then self.gradBias:add(scale, gradOutput:sum(1)) end
+end
+
+function Bilinear:sharedAccUpdateGradParameters(input, gradOutput, lr)
+ -- we do not need to accumulate parameters when sharing:
+ self:defaultAccUpdateGradParameters(input, gradOutput, lr)
+end
+
+function Bilinear:__tostring__()
+ return torch.type(self) ..
+ string.format(
+ '(%dx%d -> %d) %s',
+ self.weight:size(2), self.weight:size(3), self.weight:size(1),
+ (self.bias == nil and ' without bias' or '')
+ )
+end
+
+function Bilinear:clearState()
+ if self.buff2 then self.buff2:set() end
+ if self.buff1 then self.buff1:set() end
+ return parent.clearState(self)
+end
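
A sketch of nn.Bilinear usage (illustrative): the input is a table of two 2D tensors with the same number of rows, and output unit k computes x1^T W[k] x2 plus an optional bias:

require 'nn'

local m = nn.Bilinear(5, 4, 3)            -- inputSize1=5, inputSize2=4, outputSize=3
local x1 = torch.randn(8, 5)              -- 8 instances
local x2 = torch.randn(8, 4)
local y = m:forward{x1, x2}               -- 8x3 output
local g = m:backward({x1, x2}, torch.randn(8, 3))  -- table {dL/dx1, dL/dx2}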
diff --git a/contrib/lua-torch/nn/Bottle.lua b/contrib/lua-torch/nn/Bottle.lua
new file mode 100644
index 000000000..6dee432f5
--- /dev/null
+++ b/contrib/lua-torch/nn/Bottle.lua
@@ -0,0 +1,71 @@
+local Bottle, parent = torch.class("nn.Bottle", "nn.Decorator")
+local unpack = unpack or table.unpack
+
+function Bottle:__init(module, nInputDim, nOutputDim)
+ parent.__init(self, module)
+ self.nInputDim = nInputDim or 2
+ self.nOutputDim = nOutputDim or self.nInputDim
+ self.dimDelta = self.nInputDim - self.nOutputDim
+ -- Used to reshape the gradients
+ self.inShape = torch.Tensor(self.nInputDim)
+ self.outShape = torch.Tensor(self.nOutputDim)
+end
+
+function Bottle:updateOutput(input)
+ -- first batchDims dimensions will be fused
+ local batchDims = input:dim() - self.nInputDim + 1
+ -- see if bottle is required
+ if batchDims > 1 then
+ -- bottle the first dims
+ local inSize = torch.LongTensor(input:size())
+ local squeezeSize = inSize[{{1, batchDims - 1}}]:prod()
+ self.inShape:copy(inSize[{{batchDims, input:dim()}}])
+ self.inShape[{{1}}]:mul(squeezeSize)
+ -- Forward with the module's dimension
+ local newInput = input:view(unpack(self.inShape:totable()))
+ local output = self.modules[1]:updateOutput(newInput)
+ assert(output:dim() == self.nOutputDim,
+ "Wrong number of output dims on module. Expected: " ..
+ self.nOutputDim .. ' but got ' ..
+ tostring(output and output:dim()))
+ self.outShape:copy(torch.LongTensor(output:size()))
+ if math.abs(self.dimDelta) > 0 then
+ inSize:resize(inSize:size(1) - self.dimDelta)
+ end
+ inSize[{{batchDims, inSize:size(1)}}]:copy(self.outShape)
+ inSize[{{batchDims}}]:div(squeezeSize)
+ -- unbottle
+ self.output:set(output:view(unpack(torch.totable(inSize))))
+ else
+ self.output:set(self.modules[1]:updateOutput(input))
+ end
+ return self.output
+end
+
+function Bottle:updateGradInput(input, gradOutput)
+ if input:dim() > self.nInputDim then
+ local input_ = input:view(unpack(self.inShape:totable()))
+ local gradOutput_ = gradOutput:view(unpack(self.outShape:totable()))
+ self.modules[1]:updateGradInput(input_, gradOutput_)
+ if self.modules[1].gradInput then
+ self.gradInput:set(self.modules[1].gradInput:viewAs(input))
+ else
+ self.gradInput = nil
+ end
+ else
+ if self.modules[1].gradInput then
+ self.gradInput:set(self.modules[1]:updateGradInput(input, gradOutput))
+ else
+ self.gradInput = nil
+ end
+ end
+ return self.gradInput
+end
+
+function Bottle:accGradParameters(input, gradOutput, scale)
+ if input:dim() > self.nInputDim then
+ input = input:view(unpack(self.inShape:totable()))
+ gradOutput = gradOutput:view(unpack(self.outShape:totable()))
+ end
+ self.modules[1]:accGradParameters(input, gradOutput, scale)
+end
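
An illustrative sketch of nn.Bottle: it fuses the leading batch dimensions so a module expecting nInputDim-dimensional input (2 by default, as for nn.Linear) can be applied to higher-dimensional tensors:

require 'nn'

local m = nn.Bottle(nn.Linear(10, 3))     -- Linear expects 2D input
local x = torch.randn(4, 5, 10)           -- 3D input: a 4x5 'batch' of 10-vectors
local y = m:forward(x)                    -- viewed as 20x10 internally; y is 4x5x3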
diff --git a/contrib/lua-torch/nn/CAdd.lua b/contrib/lua-torch/nn/CAdd.lua
new file mode 100644
index 000000000..1d7b45726
--- /dev/null
+++ b/contrib/lua-torch/nn/CAdd.lua
@@ -0,0 +1,127 @@
+local CAdd, parent = torch.class("nn.CAdd", "nn.Module")
+
+function CAdd:__init(...)
+ parent.__init(self)
+
+ local arg = {...}
+
+ self.size = torch.LongStorage()
+ local n = #arg
+ if n == 1 and torch.type(arg[1]) == 'torch.LongStorage' then
+ self.size:resize(#arg[1]):copy(arg[1])
+ else
+ self.size:resize(n)
+ for i=1,n do
+ self.size[i] = arg[i]
+ end
+ end
+
+ self.bias = torch.Tensor(self.size)
+ self.gradBias = torch.Tensor(self.size)
+
+ self.output:resize(self.size)
+
+ self:reset()
+end
+
+function CAdd:reset(stdv)
+ if stdv then
+ --std of uniform distribution on interval [-a,a] = a/sqrt(3)
+ stdv = stdv * math.sqrt(3)
+ else
+ stdv = 1.0/math.sqrt(self.bias:nElement())
+ end
+ self.bias:uniform(-stdv,stdv)
+end
+
+function CAdd:updateOutput(input)
+ self._output = self._output or input.new()
+ self._bias = self._bias or input.new()
+ self._expand = self._expand or input.new()
+ self._repeat = self._repeat or input.new()
+
+ self.output:resizeAs(input):copy(input)
+ if input:nElement() == self.bias:nElement() then
+ self.output:add(self.bias)
+ else
+ if self.bias:dim() == input:dim() then
+ self._output:set(self.output)
+ self._bias:set(self.bias)
+ else
+ local batchSize = input:size(1)
+ self._output:view(self.output, batchSize, -1)
+ self._bias:view(self.bias, 1, -1)
+ end
+
+ self._expand:expandAs(self._bias, self._output)
+
+ --expandAs uses stride 0 and self._expand is not contiguous
+ --cuda ops may assume contiguous input
+ if torch.type(input) == 'torch.CudaTensor' then
+ self._repeat:resizeAs(self._expand):copy(self._expand)
+ self._output:add(self._repeat)
+ else
+ self._output:add(self._expand)
+ end
+ end
+
+ return self.output
+end
+
+function CAdd:updateGradInput(input, gradOutput)
+ self.gradInput = self.gradInput or input.new()
+ self.gradInput:resizeAs(gradOutput):copy(gradOutput)
+
+ return self.gradInput
+end
+
+function CAdd:accGradParameters(input, gradOutput, scale)
+ scale = scale or 1
+
+ self._gradBias = self._gradBias or gradOutput.new()
+ self._gradOutput = self._gradOutput or gradOutput.new()
+ self._repeat = self._repeat or gradOutput.new()
+
+ if self.bias:nElement() == gradOutput:nElement() then
+ self.gradBias:add(scale, gradOutput)
+ else
+ if self.bias:dim() == gradOutput:dim() then
+ self._gradBias:set(self.gradBias)
+ self._gradOutput:set(gradOutput)
+ else
+ local batchSize = input:size(1)
+ self._gradBias:view(self.gradBias, 1, -1)
+ self._gradOutput:view(gradOutput, batchSize, -1)
+ end
+
+ self._gradBias:expandAs(self._gradBias, self._gradOutput)
+
+ --expandAs uses stride 0 and self._gradBias is not contiguous
+ --cuda ops may assume contiguous input
+ if torch.type(self._gradBias) == 'torch.CudaTensor' then
+ self._repeat:resizeAs(self._gradBias):copy(self._gradBias)
+ self._repeat:add(scale, self._gradOutput)
+ self._gradBias:copy(self._repeat)
+ else
+ self._gradBias:add(scale, self._gradOutput)
+ end
+ end
+end
+
+function CAdd:type(type, tensorCache)
+ if type then
+ self:clearState()
+ end
+ return parent.type(self, type, tensorCache)
+end
+
+function CAdd:clearState()
+ nn.utils.clear(self, {
+ '_gradBias',
+ '_expand',
+ '_output',
+ '_bias',
+ '_repeat'
+ })
+ return parent.clearState(self)
+end
diff --git a/contrib/lua-torch/nn/CAddTable.lua b/contrib/lua-torch/nn/CAddTable.lua
new file mode 100644
index 000000000..79deb7e9b
--- /dev/null
+++ b/contrib/lua-torch/nn/CAddTable.lua
@@ -0,0 +1,36 @@
+local CAddTable, parent = torch.class('nn.CAddTable', 'nn.Module')
+
+function CAddTable:__init(ip)
+ parent.__init(self)
+ self.inplace = ip
+ self.gradInput = {}
+end
+
+function CAddTable:updateOutput(input)
+ if self.inplace then
+ self.output:set(input[1])
+ else
+ self.output:resizeAs(input[1]):copy(input[1])
+ end
+ for i=2,#input do
+ self.output:add(input[i])
+ end
+ return self.output
+end
+
+function CAddTable:updateGradInput(input, gradOutput)
+ for i=1,#input do
+ self.gradInput[i] = self.gradInput[i] or input[1].new()
+ if self.inplace then
+ self.gradInput[i]:set(gradOutput)
+ else
+ self.gradInput[i]:resizeAs(input[i]):copy(gradOutput)
+ end
+ end
+
+ for i=#input+1, #self.gradInput do
+ self.gradInput[i] = nil
+ end
+
+ return self.gradInput
+end
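
A usage sketch for nn.CAddTable (illustrative): it sums a table of same-sized tensors; constructing it with true selects the in-place variant:

require 'nn'

local m = nn.CAddTable()
local a = torch.Tensor{1, 2}
local b = torch.Tensor{10, 20}
local y = m:forward{a, b}                 -- y = [11, 22]
local g = m:backward({a, b}, torch.ones(2))  -- each g[i] is a copy of gradOutput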
diff --git a/contrib/lua-torch/nn/CAddTensorTable.lua b/contrib/lua-torch/nn/CAddTensorTable.lua
new file mode 100644
index 000000000..16efe4450
--- /dev/null
+++ b/contrib/lua-torch/nn/CAddTensorTable.lua
@@ -0,0 +1,43 @@
+
+local CAddTensorTable, parent = torch.class('nn.CAddTensorTable', 'nn.Module')
+
+function CAddTensorTable:__init()
+ parent.__init(self)
+ self.gradInput = {}
+end
+
+-- input is a table with 2 entries. input[1] is the vector to be added.
+-- input[2] is the table to which we add the vector
+function CAddTensorTable:updateOutput(input)
+ local currentOutput = {}
+ for i=1,#input[2] do
+ currentOutput[i] = currentOutput[i] or input[1].new()
+ currentOutput[i]:resizeAs(input[1])
+ currentOutput[i]:copy(input[2][i])
+ currentOutput[i]:add(input[1])
+ end
+ for i = #input[2]+1, #currentOutput do
+ currentOutput[i] = nil
+ end
+ self.output = currentOutput
+ return self.output
+end
+
+function CAddTensorTable:updateGradInput(input, gradOutput)
+ self.gradInput[1] = self.gradInput[1] or input[1].new()
+ self.gradInput[1]:resizeAs(input[1])
+ self.gradInput[1]:copy(gradOutput[1])
+ for i=2, #input[2] do
+ self.gradInput[1]:add(gradOutput[i])
+ end
+ self.gradInput[2] = self.gradInput[2] or {}
+ for i=1,#input[2] do
+ self.gradInput[2][i] = self.gradInput[2][i] or input[1].new()
+ self.gradInput[2][i]:resizeAs(input[1])
+ self.gradInput[2][i]:copy(gradOutput[i])
+ end
+ for i=#input[2]+1, #self.gradInput[2] do
+ self.gradInput[2][i] = nil
+ end
+ return self.gradInput
+end
\ No newline at end of file
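
An illustrative sketch of nn.CAddTensorTable: input[1] is a vector and input[2] a table of tensors of the same size; the vector is added to every entry of the table:

require 'nn'

local m = nn.CAddTensorTable()
local v = torch.Tensor{1, 1, 1}
local t = {torch.Tensor{1, 2, 3}, torch.Tensor{10, 20, 30}}
local out = m:forward{v, t}               -- {[2, 3, 4], [11, 21, 31]}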
diff --git a/contrib/lua-torch/nn/CDivTable.lua b/contrib/lua-torch/nn/CDivTable.lua
new file mode 100644
index 000000000..bf044c9af
--- /dev/null
+++ b/contrib/lua-torch/nn/CDivTable.lua
@@ -0,0 +1,26 @@
+
+local CDivTable, parent = torch.class('nn.CDivTable', 'nn.Module')
+
+function CDivTable:__init()
+ parent.__init(self)
+ self.gradInput = {}
+end
+
+function CDivTable:updateOutput(input)
+ self.output:resizeAs(input[1]):copy(input[1])
+ self.output:cdiv(input[2])
+ return self.output
+end
+
+function CDivTable:updateGradInput(input, gradOutput)
+ self.gradInput[1] = self.gradInput[1] or input[1].new()
+ self.gradInput[2] = self.gradInput[2] or input[1].new()
+ self.gradInput[1]:resizeAs(input[1]):copy(gradOutput):cdiv(input[2])
+ self.gradInput[2]:resizeAs(input[2]):zero():addcdiv(-1,self.gradInput[1],input[2]):cmul(input[1])
+
+ for i=#input+1, #self.gradInput do
+ self.gradInput[i] = nil
+ end
+
+ return self.gradInput
+end
diff --git a/contrib/lua-torch/nn/CMakeLists.txt b/contrib/lua-torch/nn/CMakeLists.txt
new file mode 100644
index 000000000..cebddfbfc
--- /dev/null
+++ b/contrib/lua-torch/nn/CMakeLists.txt
@@ -0,0 +1,14 @@
+CMAKE_MINIMUM_REQUIRED(VERSION 2.6 FATAL_ERROR)
+CMAKE_POLICY(VERSION 2.6)
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../torch7/lib/TH)
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}/../torch7/lib/TH)
+ADD_SUBDIRECTORY(lib)
+
+FILE(STRINGS lib/THNN/generic/THNN.h THNN_headers NEWLINE_CONSUME)
+FILE(WRITE THNN_h.lua "return [[")
+FILE(APPEND THNN_h.lua ${THNN_headers})
+FILE(APPEND THNN_h.lua "]]")
+
+FILE(GLOB luasrc *.lua)
+
+ADD_TORCH_PACKAGE(nn "" "${luasrc}")
diff --git a/contrib/lua-torch/nn/CMaxTable.lua b/contrib/lua-torch/nn/CMaxTable.lua
new file mode 100644
index 000000000..845e38d23
--- /dev/null
+++ b/contrib/lua-torch/nn/CMaxTable.lua
@@ -0,0 +1,46 @@
+local CMaxTable, parent = torch.class('nn.CMaxTable', 'nn.Module')
+
+function CMaxTable:__init()
+ parent.__init(self)
+ self.gradInput = {}
+ self.maxIdx = torch.Tensor()
+ self.mask = torch.Tensor()
+ self.maxVals = torch.Tensor()
+ self.gradMaxVals = torch.Tensor()
+end
+
+function CMaxTable:updateOutput(input)
+ self.output:resizeAs(input[1]):copy(input[1])
+ self.maxIdx:resizeAs(input[1]):fill(1)
+ for i=2,#input do
+ self.maskByteTensor = self.maskByteTensor or
+ (torch.type(self.output) == 'torch.CudaTensor' and
+ torch.CudaByteTensor() or torch.ByteTensor())
+ self.mask:gt(input[i], self.output)
+ self.maskByteTensor:resize(self.mask:size()):copy(self.mask)
+ self.maxIdx:maskedFill(self.maskByteTensor, i)
+ self.maxVals:maskedSelect(input[i], self.maskByteTensor)
+ self.output:maskedCopy(self.maskByteTensor, self.maxVals)
+ end
+ return self.output
+end
+
+function CMaxTable:updateGradInput(input, gradOutput)
+ for i=1,#input do
+ self.gradInput[i] = self.gradInput[i] or input[i].new()
+ self.gradInput[i]:resizeAs(input[i]):fill(0.0)
+ self.maskByteTensor = self.maskByteTensor or
+ (torch.type(self.output) == 'torch.CudaTensor' and
+ torch.CudaByteTensor() or torch.ByteTensor())
+ self.mask:eq(self.maxIdx, i)
+ self.maskByteTensor:resize(self.mask:size()):copy(self.mask)
+ self.gradMaxVals:maskedSelect(gradOutput, self.maskByteTensor)
+ self.gradInput[i]:maskedCopy(self.maskByteTensor, self.gradMaxVals)
+ end
+
+ for i=#input+1, #self.gradInput do
+ self.gradInput[i] = nil
+ end
+
+ return self.gradInput
+end
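
A usage sketch for nn.CMaxTable (illustrative): it takes the element-wise maximum over a table of same-sized tensors, and in backward routes each gradient entry only to the input that won the max:

require 'nn'

local m = nn.CMaxTable()
local a = torch.Tensor{1, 5}
local b = torch.Tensor{3, 2}
local y = m:forward{a, b}                 -- y = [3, 5]
local g = m:backward({a, b}, torch.ones(2))  -- g[1] = [0, 1], g[2] = [1, 0]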
diff --git a/contrib/lua-torch/nn/CMinTable.lua b/contrib/lua-torch/nn/CMinTable.lua
new file mode 100644
index 000000000..25b9a19a2
--- /dev/null
+++ b/contrib/lua-torch/nn/CMinTable.lua
@@ -0,0 +1,46 @@
+local CMinTable, parent = torch.class('nn.CMinTable', 'nn.Module')
+
+function CMinTable:__init()
+ parent.__init(self)
+ self.gradInput = {}
+ self.minIdx = torch.Tensor()
+ self.mask = torch.Tensor()
+ self.minVals = torch.Tensor()
+ self.gradMaxVals = torch.Tensor()
+end
+
+function CMinTable:updateOutput(input)
+ self.output:resizeAs(input[1]):copy(input[1])
+ self.minIdx:resizeAs(input[1]):fill(1)
+ for i=2,#input do
+ self.maskByteTensor = self.maskByteTensor or
+ (torch.type(self.output) == 'torch.CudaTensor' and
+ torch.CudaByteTensor() or torch.ByteTensor())
+ self.mask:lt(input[i], self.output)
+ self.maskByteTensor:resize(self.mask:size()):copy(self.mask)
+ self.minIdx:maskedFill(self.maskByteTensor, i)
+ self.minVals:maskedSelect(input[i], self.maskByteTensor)
+ self.output:maskedCopy(self.maskByteTensor, self.minVals)
+ end
+ return self.output
+end
+
+function CMinTable:updateGradInput(input, gradOutput)
+ for i=1,#input do
+ self.gradInput[i] = self.gradInput[i] or input[i].new()
+ self.gradInput[i]:resizeAs(input[i]):fill(0.0)
+ self.maskByteTensor = self.maskByteTensor or
+ (torch.type(self.output) == 'torch.CudaTensor' and
+ torch.CudaByteTensor() or torch.ByteTensor())
+ self.mask:eq(self.minIdx, i)
+ self.maskByteTensor:resize(self.mask:size()):copy(self.mask)
+ self.gradMaxVals:maskedSelect(gradOutput, self.maskByteTensor)
+ self.gradInput[i]:maskedCopy(self.maskByteTensor, self.gradMaxVals)
+ end
+
+ for i=#input+1, #self.gradInput do
+ self.gradInput[i] = nil
+ end
+
+ return self.gradInput
+end
diff --git a/contrib/lua-torch/nn/CMul.lua b/contrib/lua-torch/nn/CMul.lua
new file mode 100644
index 000000000..890169761
--- /dev/null
+++ b/contrib/lua-torch/nn/CMul.lua
@@ -0,0 +1,166 @@
+local CMul, parent = torch.class('nn.CMul', 'nn.Module')
+
+function CMul:__init(...)
+ parent.__init(self)
+
+ local arg = {...}
+
+ self.size = torch.LongStorage()
+ local n = #arg
+ if n == 1 and torch.type(arg[1]) == 'torch.LongStorage' then
+ self.size:resize(#arg[1]):copy(arg[1])
+ else
+ self.size:resize(n)
+ for i=1,n do
+ self.size[i] = arg[i]
+ end
+ end
+
+ self.weight = torch.Tensor(self.size)
+ self.gradWeight = torch.Tensor(self.size)
+
+ self.output:resize(self.size)
+
+ self:reset()
+end
+
+function CMul:reset(stdv)
+ if stdv then
+ stdv = stdv * math.sqrt(3)
+ else
+ stdv = 1./math.sqrt(self.weight:nElement())
+ end
+ self.weight:uniform(-stdv,stdv)
+end
+
+function CMul:updateOutput(input)
+ -- lazy-initialize
+ self._output = self._output or input.new()
+ self._weight = self._weight or input.new()
+ self._expand = self._expand or input.new()
+ self._repeat = self._repeat or input.new()
+
+ self.output:resizeAs(input):copy(input)
+ if input:nElement() == self.weight:nElement() then
+ self._output:view(self.output, -1)
+ self._weight:view(self.weight, -1)
+
+ self._output:cmul(self._weight)
+ else
+ if self.weight:dim() == input:dim() then
+ self._output:set(self.output)
+ self._weight:set(self.weight)
+ else
+ local batchSize = input:size(1)
+ self._output:view(self.output, batchSize, -1)
+ self._weight:view(self.weight, 1, -1)
+ end
+
+ self._expand:expandAs(self._weight, self._output)
+
+ if torch.type(input) == 'torch.CudaTensor' then
+ self._repeat:resizeAs(self._expand):copy(self._expand)
+ self._output:cmul(self._repeat)
+ else
+ self._output:cmul(self._expand)
+ end
+ end
+
+ return self.output
+end
+
+function CMul:updateGradInput(input, gradOutput)
+ if not self.gradInput then
+ return
+ end
+
+ self._gradOutput = self._gradOutput or input.new()
+ self._gradInput = self._gradInput or input.new()
+
+ self.gradInput:resizeAs(input):zero()
+ if self.weight:nElement() == gradOutput:nElement() then
+ self.gradInput:addcmul(1, self.weight, gradOutput)
+ else
+ if self.weight:dim() == input:dim() then
+ nn.utils.contiguousView(self._gradOutput, gradOutput, gradOutput:size())
+ nn.utils.contiguousView(self._gradInput, self.gradInput, self.gradInput:size())
+ self._weight:set(self.weight)
+ else
+ local batchSize = input:size(1)
+ nn.utils.contiguousView(self._gradOutput, gradOutput, batchSize, -1)
+ nn.utils.contiguousView(self._gradInput, self.gradInput, batchSize, -1)
+ self._weight:view(self.weight, 1, -1)
+ end
+
+ self._expand:expandAs(self._weight, self._gradOutput)
+
+ if torch.type(input) == 'torch.CudaTensor' then
+ self._repeat:resizeAs(self._expand):copy(self._expand)
+ self._gradInput:addcmul(1, self._repeat, self._gradOutput)
+ else
+ self._gradInput:addcmul(1, self._expand, self._gradOutput)
+ end
+ end
+
+ return self.gradInput
+end
+
+function CMul:accGradParameters(input, gradOutput, scale)
+ scale = scale or 1
+
+ self._input = self._input or input.new()
+ self._gradWeight = self._gradWeight or input.new()
+ self._sum = self._sum or input.new()
+
+ if self.weight:nElement() == gradOutput:nElement() then
+ self.gradWeight:addcmul(scale, input, gradOutput)
+ else
+ if self.weight:dim() == input:dim() then
+ nn.utils.contiguousView(self._input, input, input:size())
+ nn.utils.contiguousView(self._gradOutput, gradOutput, gradOutput:size())
+ self._gradWeight:set(self.gradWeight)
+
+ self._repeat:cmul(self._input, self._gradOutput)
+ local sumInto = self._sum
+ local sumFrom = self._repeat
+ for i=1,self.weight:dim() do
+ if self.weight:size(i) ~= input:size(i) then
+ sumInto:sum(sumFrom, i)
+ sumInto = sumFrom
+ sumFrom = sumFrom == self._repeat and self._sum or self._repeat
+ end
+ end
+ self._gradWeight:add(scale, sumFrom)
+ else
+ local batchSize = input:size(1)
+ nn.utils.contiguousView(self._input, input, batchSize, -1)
+ nn.utils.contiguousView(self._gradOutput, gradOutput, batchSize, -1)
+ self._gradWeight:view(self.gradWeight, 1, -1)
+
+ self._repeat:cmul(self._input, self._gradOutput)
+ self._sum:sum(self._repeat, 1)
+ self._gradWeight:add(scale, self._sum)
+ end
+
+ end
+end
+
+function CMul:type(type, tensorCache)
+ if type then
+ self:clearState()
+ end
+ return parent.type(self, type, tensorCache)
+end
+
+function CMul:clearState()
+ nn.utils.clear(self, {
+ '_input',
+ '_output',
+ '_weight',
+ '_gradWeight',
+ '_expand',
+ '_repeat',
+ '_sum',
+ })
+ return parent.clearState(self)
+end
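
An illustrative sketch of nn.CMul: a learned component-wise scaling, broadcast over the batch dimension when the input has more elements than the weight:

require 'nn'

local m = nn.CMul(5)                      -- 5 learnable scale factors
local x = torch.ones(2, 5)                -- batch of 2
local y = m:forward(x)                    -- each row equals m.weight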
diff --git a/contrib/lua-torch/nn/CMulTable.lua b/contrib/lua-torch/nn/CMulTable.lua
new file mode 100644
index 000000000..b47378e83
--- /dev/null
+++ b/contrib/lua-torch/nn/CMulTable.lua
@@ -0,0 +1,55 @@
+
+local CMulTable, parent = torch.class('nn.CMulTable', 'nn.Module')
+
+function CMulTable:__init()
+ parent.__init(self)
+ self.gradInput = {}
+end
+
+function CMulTable:updateOutput(input)
+ self.output:resizeAs(input[1]):copy(input[1])
+ for i=2,#input do
+ self.output:cmul(input[i])
+ end
+ return self.output
+end
+
+function CMulTable:updateGradInput_efficient(input, gradOutput)
+ self.tout = self.tout or input[1].new()
+ self.tout:resizeAs(self.output)
+ for i=1,#input do
+ self.gradInput[i] = self.gradInput[i] or input[1].new()
+ self.gradInput[i]:resizeAs(input[i]):copy(gradOutput)
+ self.tout:copy(self.output):cdiv(input[i])
+ self.gradInput[i]:cmul(self.tout)
+ end
+
+ for i=#input+1, #self.gradInput do
+ self.gradInput[i] = nil
+ end
+
+ return self.gradInput
+end
+
+function CMulTable:updateGradInput(input, gradOutput)
+ for i=1,#input do
+ self.gradInput[i] = self.gradInput[i] or input[1].new()
+ self.gradInput[i]:resizeAs(input[i]):copy(gradOutput)
+ for j=1,#input do
+ if i~=j then
+ self.gradInput[i]:cmul(input[j])
+ end
+ end
+ end
+
+ for i=#input+1, #self.gradInput do
+ self.gradInput[i] = nil
+ end
+
+ return self.gradInput
+end
+
+function CMulTable:clearState()
+ if self.tout then self.tout:set() end
+ return parent.clearState(self)
+end
diff --git a/contrib/lua-torch/nn/CONTRIBUTING.md b/contrib/lua-torch/nn/CONTRIBUTING.md
new file mode 100644
index 000000000..cc800154e
--- /dev/null
+++ b/contrib/lua-torch/nn/CONTRIBUTING.md
@@ -0,0 +1,136 @@
+# Contributing to Torch7 Core (torch7, nn, cutorch, cunn)
+
+Thanks a lot! There are plenty of ways you can help!
+
+Please take a moment to review this document in order to make the contribution
+process easy and effective for everyone involved.
+
+Following these guidelines helps to communicate that you respect the time of
+the developers managing and developing this open source project. In return,
+they should reciprocate that respect in addressing your issue or assessing
+patches and features.
+
+
+## Using the issue tracker
+
+The [issue tracker](https://github.com/torch/nn/issues) is
+the preferred channel for [bug reports](#bugs), [feature requests](#features)
+and [submitting pull requests](#pull-requests), but please respect the following
+restrictions:
+
+* Please **do not** use the issue tracker for personal support requests (use
+ [mailing-list](http://groups.google.com/forum/#!forum/torch7)).
+
+* Please **do not** open issues regarding the code in a torch package
+  outside the core. For example, don't open issues about the
+  REPL in the nn issue tracker; use the trepl issue tracker for that.
+
+<a name="bugs"></a>
+## Bug reports
+
+A bug is a _demonstrable problem_ that is caused by the code in the repository.
+Good bug reports are extremely helpful - thank you!
+
+Guidelines for bug reports:
+
+1. **Use the GitHub issue search** &mdash; check if the issue has already been
+ reported.
+
+2. **Check if the issue has been fixed** &mdash; try to reproduce it using the
+ latest `master` or development branch in the repository.
+
+3. **Isolate the problem** &mdash; ideally create a reduced test case,
+   preferably within 100 lines of code.
+
+A good bug report shouldn't leave others needing to chase you up for more
+information. Please try to be as detailed as possible in your report. What is
+your environment? What steps will reproduce the issue? On what OS do you
+experience the problem? What would you expect the outcome to be? All these
+details will help people fix any potential bugs.
+
+<a name="features"></a>
+## Feature requests
+
+Feature requests are welcome. Keep in mind that Torch is community-developed
+and the maintainers are not exclusively torch developers. The purpose of a
+feature request is to make others who may want to implement the feature
+aware of the interest in it.
+
+
+<a name="pull-requests"></a>
+## Pull requests
+
+Good pull requests - patches, improvements, new features - are a fantastic
+help. They should remain focused in scope **and avoid containing unrelated
+commits.**
+
+**Please ask first** before embarking on any significant pull request (e.g.
+implementing features, refactoring code, porting to a different language),
+otherwise you risk spending a lot of time working on something that the
+project's developers might not want to merge into the project.
+
+Please adhere to the coding conventions used throughout a project (indentation,
+accurate comments, etc.) and any other requirements (such as test coverage).
+
+Adhering to the following process is the best way to get your work
+included in the project:
+
+1. [Fork](https://help.github.com/articles/fork-a-repo) the project, clone your
+ fork, and configure the remotes:
+
+ ```bash
+ # Clone your fork of the repo into the current directory
+ git clone https://github.com/<your-username>/nn.git
+ # Navigate to the newly cloned directory
+ cd nn
+ # Assign the original repo to a remote called "upstream"
+ git remote add upstream https://github.com/torch/nn.git
+ ```
+
+2. If you cloned a while ago, get the latest changes from upstream:
+
+ ```bash
+ git checkout master
+ git pull upstream master
+ ```
+
+3. Create a new topic branch (off the main project development branch) to
+ contain your feature, change, or fix:
+
+ ```bash
+ git checkout -b <topic-branch-name>
+ ```
+
+4. Commit your changes in logical chunks. Please try to adhere to these [git commit
+   message guidelines](http://tbaggery.com/2008/04/19/a-note-about-git-commit-messages.html).
+   Use Git's [interactive rebase](https://help.github.com/articles/about-git-rebase)
+   feature to tidy up your commits before making them public. This helps us keep the
+   commit history clean and in logical blocks as torch grows.
+ For example:
+ - If you are adding a new function or a module, keep the module + tests + doc
+ to a single commit unless logically warranted.
+ - If you are fixing a bug, keep the bugfix to a single commit unless logically warranted.
+
+5. Locally merge (or rebase) the upstream development branch into your topic branch:
+
+ ```bash
+ git pull [--rebase] upstream master
+ ```
+
+6. Push your topic branch up to your fork:
+
+ ```bash
+ git push origin <topic-branch-name>
+ ```
+
+7. [Open a Pull Request](https://help.github.com/articles/using-pull-requests/)
+ with a clear title and description.
+
+**IMPORTANT**: By submitting a patch, you agree to allow the project owners to
+license your work under the terms of the BSD License.
+
+## Development workflow tips
+
+* While you are changing Lua files, you can simply symlink the cloned nn directory to `~/torch/install/share/lua/5.1/nn` so that any change is reflected in the current install, without constantly having to run `luarocks make rocks/*`.
+* If you are changing C files, then, after every change, run `luarocks make rocks/*`.
+* To test, you can just use: `th -lnn -e "nn.test()"`.
diff --git a/contrib/lua-torch/nn/COPYRIGHT.txt b/contrib/lua-torch/nn/COPYRIGHT.txt
new file mode 100644
index 000000000..bc002b78a
--- /dev/null
+++ b/contrib/lua-torch/nn/COPYRIGHT.txt
@@ -0,0 +1,36 @@
+Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
+Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
+Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
+Copyright (c) 2011-2013 NYU (Clement Farabet)
+Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
+Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
+Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+3. Neither the names of Deepmind Technologies, NYU, NEC Laboratories America
+ and IDIAP Research Institute nor the names of its contributors may be
+ used to endorse or promote products derived from this software without
+ specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
diff --git a/contrib/lua-torch/nn/CReLU.lua b/contrib/lua-torch/nn/CReLU.lua
new file mode 100644
index 000000000..8da6e7974
--- /dev/null
+++ b/contrib/lua-torch/nn/CReLU.lua
@@ -0,0 +1,57 @@
+local CReLU, parent = torch.class('nn.CReLU', 'nn.Sequential')
+
+-- Implements the CReLU activation function as described by
+-- W. Shang et al. in "Understanding and Improving Convolutional Neural Networks
+-- via Concatenated Rectified Linear Units"
+function CReLU:__init(nInputDims, inplace)
+ parent.__init(self)
+ self.nInputDims = nInputDims
+ self.inplace = inplace or false
+
+ local concatTable = nn.ConcatTable()
+ concatTable:add(nn.Identity())
+ concatTable:add(nn.MulConstant(-1))
+ self:add(concatTable)
+ self:add(nn.JoinTable(2))
+ self:add(nn.ReLU(self.inplace))
+end
+
+function CReLU:updateOutput(input)
+ local input_
+ local batched = input:dim() == (self.nInputDims + 1)
+ if not batched then
+ input_ = input:view(1, -1)
+ else
+ input_ = input:view(input:size(1), -1)
+ end
+ parent.updateOutput(self, input_)
+ local osize = input:size()
+ if not batched then
+ osize[1] = osize[1] * 2
+ else
+ osize[2] = osize[2] * 2
+ end
+ self.output:resize(osize)
+ return self.output
+end
+
+function CReLU:backward(input, gradOutput)
+ return self:updateGradInput(input, gradOutput)
+end
+
+function CReLU:updateGradInput(input, gradOutput)
+ local batched = input:dim() == (self.nInputDims + 1)
+ if not batched then
+ parent.updateGradInput(self, input:view(1, -1), gradOutput:view(1, -1))
+ else
+ parent.updateGradInput(self, input:view(input:size(1), -1),
+ gradOutput:view(input:size(1), -1))
+ end
+
+ self.gradInput:resizeAs(input)
+ return self.gradInput
+end
+
+function CReLU:__tostring__()
+ return "CReLU()"
+end
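+
+-- A minimal usage sketch: CReLU doubles the feature dimension. For a batch
+-- of chw samples (nInputDims = 3):
+--   local crelu = nn.CReLU(3)
+--   local y = crelu:forward(torch.randn(8, 4, 5, 5)) -- y is 8x8x5x5:
+--   -- channels 1-4 hold relu(x), channels 5-8 hold relu(-x)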
diff --git a/contrib/lua-torch/nn/CSubTable.lua b/contrib/lua-torch/nn/CSubTable.lua
new file mode 100644
index 000000000..eb7492055
--- /dev/null
+++ b/contrib/lua-torch/nn/CSubTable.lua
@@ -0,0 +1,26 @@
+
+local CSubTable, parent = torch.class('nn.CSubTable', 'nn.Module')
+
+function CSubTable:__init()
+ parent.__init(self)
+ self.gradInput = {}
+end
+
+function CSubTable:updateOutput(input)
+ self.output:resizeAs(input[1]):copy(input[1])
+ self.output:add(-1,input[2])
+ return self.output
+end
+
+function CSubTable:updateGradInput(input, gradOutput)
+ self.gradInput[1] = self.gradInput[1] or input[1].new()
+ self.gradInput[2] = self.gradInput[2] or input[1].new()
+ self.gradInput[1]:resizeAs(input[1]):copy(gradOutput)
+ self.gradInput[2]:resizeAs(input[2]):copy(gradOutput):mul(-1)
+
+ for i=#input+1, #self.gradInput do
+ self.gradInput[i] = nil
+ end
+
+ return self.gradInput
+end
diff --git a/contrib/lua-torch/nn/Clamp.lua b/contrib/lua-torch/nn/Clamp.lua
new file mode 100644
index 000000000..36397a157
--- /dev/null
+++ b/contrib/lua-torch/nn/Clamp.lua
@@ -0,0 +1,5 @@
+local Clamp, Parent = torch.class('nn.Clamp', 'nn.HardTanh')
+
+function Clamp:__init(min_value, max_value)
+ Parent.__init(self, min_value, max_value)
+end
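+
+-- Usage sketch: a thin alias over nn.HardTanh, e.g.
+--   nn.Clamp(-0.5, 0.5):forward(x) -- clamps each element of x to [-0.5, 0.5]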
diff --git a/contrib/lua-torch/nn/ClassNLLCriterion.lua b/contrib/lua-torch/nn/ClassNLLCriterion.lua
new file mode 100644
index 000000000..dae0e6685
--- /dev/null
+++ b/contrib/lua-torch/nn/ClassNLLCriterion.lua
@@ -0,0 +1,82 @@
+local THNN = require 'nn.THNN'
+local ClassNLLCriterion, parent = torch.class('nn.ClassNLLCriterion', 'nn.Criterion')
+
+function ClassNLLCriterion:__init(weights, sizeAverage, ignoreIndex)
+ parent.__init(self)
+ self.sizeAverage = (sizeAverage == nil) and true or sizeAverage
+ self.ignoreIndex = ignoreIndex or -100 -- this target index will be ignored
+ if weights then
+ assert(weights:dim() == 1, "weights input should be 1-D Tensor")
+ self.weights = weights
+ end
+
+ self.output_tensor = torch.zeros(1)
+ self.total_weight_tensor = torch.ones(1)
+ self.target = torch.zeros(1):long()
+end
+
+function ClassNLLCriterion:__len()
+ if (self.weights) then
+ return #self.weights
+ else
+ return 0
+ end
+end
+
+function ClassNLLCriterion:updateOutput(input, target)
+ if type(target) == 'number' then
+ if torch.typename(input):find('torch%.Cuda.*Tensor') then
+ self.target = torch.CudaLongTensor and self.target:cudaLong() or self.target:cuda()
+ else
+ self.target = self.target:long()
+ end
+ self.target:resize(1)
+ self.target[1] = target
+ elseif torch.typename(input):find('torch%.Cuda.*Tensor') then
+ self.target = torch.CudaLongTensor and target:cudaLong() or target
+ else
+ self.target = target:long()
+ end
+
+ input.THNN.ClassNLLCriterion_updateOutput(
+ input:cdata(),
+ self.target:cdata(),
+ self.output_tensor:cdata(),
+ self.sizeAverage,
+ THNN.optionalTensor(self.weights),
+ self.total_weight_tensor:cdata(),
+ self.ignoreIndex
+ )
+ self.output = self.output_tensor[1]
+ return self.output, self.total_weight_tensor[1]
+end
+
+function ClassNLLCriterion:updateGradInput(input, target)
+ if type(target) == 'number' then
+ if torch.typename(input):find('torch%.Cuda.*Tensor') then
+ self.target = torch.CudaLongTensor and self.target:cudaLong() or self.target:cuda()
+ else
+ self.target = self.target:long()
+ end
+ self.target:resize(1)
+ self.target[1] = target
+ elseif torch.typename(input):find('torch%.Cuda.*Tensor') then
+ self.target = torch.CudaLongTensor and target:cudaLong() or target
+ else
+ self.target = target:long()
+ end
+
+ self.gradInput:resizeAs(input):zero()
+
+ input.THNN.ClassNLLCriterion_updateGradInput(
+ input:cdata(),
+ self.target:cdata(),
+ self.gradInput:cdata(),
+ self.sizeAverage,
+ THNN.optionalTensor(self.weights),
+ self.total_weight_tensor:cdata(),
+ self.ignoreIndex
+ )
+
+ return self.gradInput
+end
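+
+-- A minimal usage sketch (the input holds log-probabilities, e.g. from
+-- nn.LogSoftMax, and the target holds class indices):
+--   local crit = nn.ClassNLLCriterion()
+--   local logp = nn.LogSoftMax():forward(torch.randn(4, 10))
+--   local loss = crit:forward(logp, torch.LongTensor{1, 5, 2, 10})
+--   local grad = crit:backward(logp, torch.LongTensor{1, 5, 2, 10})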
diff --git a/contrib/lua-torch/nn/ClassSimplexCriterion.lua b/contrib/lua-torch/nn/ClassSimplexCriterion.lua
new file mode 100644
index 000000000..9cabc011f
--- /dev/null
+++ b/contrib/lua-torch/nn/ClassSimplexCriterion.lua
@@ -0,0 +1,118 @@
+local ClassSimplexCriterion, parent
+ = torch.class('nn.ClassSimplexCriterion', 'nn.MSECriterion')
+
+--[[
+ This file implements a criterion for multi-class classification.
+ It learns an embedding per class, where each class' embedding
+ is a point on an (N-1)-dimensional simplex, where N is
+ the number of classes.
+ For example usage of this class, look at doc/criterion.md
+
+ Reference: http://arxiv.org/abs/1506.08230
+
+]]--
+
+
+--[[
+ function regsplex(n):
+ regsplex returns the coordinates of the vertices of a
+ regular simplex centered at the origin.
+ The Euclidean norms of the vectors specifying the vertices are
+ all equal to 1. The input n is the dimension of the vectors;
+ the simplex has n+1 vertices.
+
+ input:
+ n -- dimension of the vectors specifying the vertices of the simplex
+
+ output:
+ a -- tensor dimensioned (n+1,n) whose rows are
+ vectors specifying the vertices
+
+ reference:
+ http://en.wikipedia.org/wiki/Simplex#Cartesian_coordinates_for_regular_n-dimensional_simplex_in_Rn
+--]]
+local function regsplex(n)
+ local a = torch.zeros(n+1,n)
+
+ for k = 1,n do
+ -- determine the last nonzero entry in the vector for the k-th vertex
+ if k==1 then a[k][k] = 1 end
+ if k>1 then a[k][k] = math.sqrt( 1 - a[{ {k},{1,k-1} }]:norm()^2 ) end
+
+ -- fill the k-th coordinates for the vectors of the remaining vertices
+ local c = (a[k][k]^2 - 1 - 1/n) / a[k][k]
+ a[{ {k+1,n+1},{k} }]:fill(c)
+ end
+
+ return a
+end
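+
+-- For instance, regsplex(2) returns the three vertices of an equilateral
+-- triangle on the unit circle (up to rounding):
+--   ( 1.0,  0.000)
+--   (-0.5,  0.866)
+--   (-0.5, -0.866)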
+
+
+function ClassSimplexCriterion:__init(nClasses)
+ parent.__init(self)
+ assert(nClasses and nClasses > 1 and nClasses == (nClasses -(nClasses % 1)),
+ "Required positive integer argument nClasses > 1")
+ self.nClasses = nClasses
+
+ -- embedding the simplex in a space of dimension strictly greater than
+ -- the minimum possible (nClasses-1) is critical for effective training.
+ local simp = regsplex(nClasses - 1)
+ self.simplex = torch.cat(simp,
+ torch.zeros(simp:size(1), nClasses -simp:size(2)),
+ 2)
+ self._target = torch.Tensor(nClasses)
+end
+
+-- handle target being either a number (a single class index) or a
+-- 1D tensor of class indices (a batch); each index is mapped to the
+-- corresponding simplex vertex
+local function transformTarget(self, target)
+ if torch.type(target) == 'number' then
+ self._target:resize(self.nClasses)
+ self._target:copy(self.simplex[target])
+ elseif torch.isTensor(target) then
+ assert(target:dim() == 1, '1D tensors only!')
+ local nSamples = target:size(1)
+ self._target:resize(nSamples, self.nClasses)
+ for i=1,nSamples do
+ self._target[i]:copy(self.simplex[target[i]])
+ end
+ end
+end
+
+function ClassSimplexCriterion:updateOutput(input, target)
+ transformTarget(self, target)
+ assert(input:nElement() == self._target:nElement())
+ self.output_tensor = self.output_tensor or input.new(1)
+ input.THNN.MSECriterion_updateOutput(
+ input:cdata(),
+ self._target:cdata(),
+ self.output_tensor:cdata(),
+ self.sizeAverage
+ )
+ self.output = self.output_tensor[1]
+ return self.output
+end
+
+function ClassSimplexCriterion:updateGradInput(input, target)
+ assert(input:nElement() == self._target:nElement())
+ input.THNN.MSECriterion_updateGradInput(
+ input:cdata(),
+ self._target:cdata(),
+ self.gradInput:cdata(),
+ self.sizeAverage
+ )
+ return self.gradInput
+end
+
+function ClassSimplexCriterion:getPredictions(input)
+ if input:dim() == 1 then
+ input = input:view(1, -1)
+ end
+ return torch.mm(input, self.simplex:t())
+end
+
+function ClassSimplexCriterion:getTopPrediction(input)
+ local prod = self:getPredictions(input)
+ local _, maxs = prod:max(prod:nDimension())
+ return maxs:view(-1)
+end
diff --git a/contrib/lua-torch/nn/Collapse.lua b/contrib/lua-torch/nn/Collapse.lua
new file mode 100644
index 000000000..a088608ca
--- /dev/null
+++ b/contrib/lua-torch/nn/Collapse.lua
@@ -0,0 +1,30 @@
+local Collapse, parent = torch.class('nn.Collapse', 'nn.Module')
+
+-- collapses non-batch dims
+function Collapse:__init(nInputDim)
+ parent.__init(self)
+ self.nInputDim = nInputDim
+end
+
+function Collapse:updateOutput(input)
+ if not input:isContiguous() then
+ self._input = self._input or input.new()
+ self._input:resize(input:size()):copy(input)
+ input = self._input
+ end
+ if input:dim() > self.nInputDim then
+ self.output:view(input,input:size(1),-1)
+ else
+ self.output:view(input,-1)
+ end
+ return self.output
+end
+
+function Collapse:updateGradInput(input, gradOutput)
+ self.gradInput:view(gradOutput, input:size())
+ return self.gradInput
+end
+
+function Collapse:clearState()
+ self._input = nil
+end
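+
+-- Usage sketch: with nInputDim = 3, a 4d batch collapses to 2d and a
+-- single 3d sample collapses to 1d:
+--   nn.Collapse(3):forward(torch.randn(8, 3, 4, 5)) -- size 8x60
+--   nn.Collapse(3):forward(torch.randn(3, 4, 5))    -- size 60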
diff --git a/contrib/lua-torch/nn/Concat.lua b/contrib/lua-torch/nn/Concat.lua
new file mode 100644
index 000000000..d7e3ee711
--- /dev/null
+++ b/contrib/lua-torch/nn/Concat.lua
@@ -0,0 +1,158 @@
+local Concat, parent = torch.class('nn.Concat', 'nn.Container')
+
+function Concat:__init(dimension)
+ parent.__init(self)
+ self.outputSize = torch.LongStorage()
+ self.dimension = dimension
+end
+
+function Concat:updateOutput(input)
+ self.outputSize = self.outputSize or torch.LongStorage()
+
+ local outs = {}
+ for i=1,#self.modules do
+ local currentOutput = self:rethrowErrors(self.modules[i], i, 'updateOutput', input)
+ outs[i] = currentOutput
+ if i == 1 then
+ self.outputSize:resize(currentOutput:dim()):copy(currentOutput:size())
+ else
+ self.outputSize[self.dimension] = self.outputSize[self.dimension] + currentOutput:size(self.dimension)
+ end
+ end
+ self.output:resize(self.outputSize)
+
+ local offset = 1
+ for i,module in ipairs(self.modules) do
+ local currentOutput = outs[i]
+ self.output:narrow(self.dimension, offset, currentOutput:size(self.dimension)):copy(currentOutput)
+ offset = offset + currentOutput:size(self.dimension)
+ end
+ return self.output
+end
+
+local function retable(t1, t2, f)
+ for k, v in ipairs(t2) do
+ if (torch.type(v) == "table") then
+ t1[k] = retable(t1[k] or {}, t2[k], f)
+ else
+ f(t1, k, v)
+ end
+ end
+ for i=#t2+1, #t1 do
+ t1[i] = nil
+ end
+ return t1
+end
+
+local function backward(self, method, input, gradOutput, scale)
+ local isTable = torch.type(input) == 'table'
+ local wasTable = torch.type(self.gradInput) == 'table'
+ scale = scale or 1
+
+ if isTable then
+ local offset = 1
+ for i,module in ipairs(self.modules) do
+ local currentOutput = module.output
+ local currentGradInput = self:rethrowErrors(module, i, method, input,
+ gradOutput:narrow(self.dimension, offset, currentOutput:size(self.dimension)), scale)
+ if torch.type(currentGradInput) ~= 'table' then
+ error"currentGradInput is not a table!"
+ end
+ if #input ~= #currentGradInput then
+ error("table size mismatch: "..#input.." ~= "..#currentGradInput)
+ end
+ if i == 1 then
+ self.gradInput = wasTable and self.gradInput or {}
+ retable(self.gradInput, currentGradInput,
+ function(t, k, v)
+ t[k] = t[k] or v:clone()
+ t[k]:resizeAs(v)
+ t[k]:copy(v)
+ end
+ )
+ else
+ retable(self.gradInput, currentGradInput,
+ function(t, k, v)
+ if t[k] then
+ t[k]:add(v)
+ else
+ t[k] = v:clone()
+ end
+ end
+ )
+ end
+ offset = offset + currentOutput:size(self.dimension)
+ end
+ else
+ self.gradInput = (not wasTable) and self.gradInput:resizeAs(input) or input:clone()
+ local offset = 1
+ for i,module in ipairs(self.modules) do
+ local currentOutput = module.output
+ local currentGradInput = self:rethrowErrors(module, i, method, input,
+ gradOutput:narrow(self.dimension, offset, currentOutput:size(self.dimension)), scale)
+ if currentGradInput then -- if the module does not produce a gradInput (for example first layer), then ignore it and move on.
+ if i==1 then
+ self.gradInput:copy(currentGradInput)
+ else
+ self.gradInput:add(currentGradInput)
+ end
+ end
+ offset = offset + currentOutput:size(self.dimension)
+ end
+ end
+ return self.gradInput
+end
+
+function Concat:updateGradInput(input, gradOutput)
+ return backward(self, 'updateGradInput', input, gradOutput)
+end
+
+function Concat:backward(input, gradOutput, scale)
+ return backward(self, 'backward', input, gradOutput, scale)
+end
+
+function Concat:accGradParameters(input, gradOutput, scale)
+ scale = scale or 1
+ local offset = 1
+ for i,module in ipairs(self.modules) do
+ local currentOutput = module.output
+ self:rethrowErrors(module, i, 'accGradParameters', input,
+ gradOutput:narrow(self.dimension, offset, currentOutput:size(self.dimension)),
+ scale)
+ offset = offset + currentOutput:size(self.dimension)
+ end
+end
+
+function Concat:accUpdateGradParameters(input, gradOutput, lr)
+ local offset = 1
+ for i,module in ipairs(self.modules) do
+ local currentOutput = module.output
+ self:rethrowErrors(module, i, 'accUpdateGradParameters',
+ input,
+ gradOutput:narrow(self.dimension, offset, currentOutput:size(self.dimension)),
+ lr)
+ offset = offset + currentOutput:size(self.dimension)
+ end
+end
+
+function Concat:__tostring__()
+ local tab = ' '
+ local line = '\n'
+ local next = ' |`-> '
+ local lastNext = ' `-> '
+ local ext = ' | '
+ local extlast = ' '
+ local last = ' ... -> '
+ local str = torch.type(self)
+ str = str .. ' {' .. line .. tab .. 'input'
+ for i=1,#self.modules do
+ if i == #self.modules then
+ str = str .. line .. tab .. lastNext .. '(' .. i .. '): ' .. tostring(self.modules[i]):gsub(line, line .. tab .. extlast)
+ else
+ str = str .. line .. tab .. next .. '(' .. i .. '): ' .. tostring(self.modules[i]):gsub(line, line .. tab .. ext)
+ end
+ end
+ str = str .. line .. tab .. last .. 'output'
+ str = str .. line .. '}'
+ return str
+end
diff --git a/contrib/lua-torch/nn/ConcatTable.lua b/contrib/lua-torch/nn/ConcatTable.lua
new file mode 100644
index 000000000..742719344
--- /dev/null
+++ b/contrib/lua-torch/nn/ConcatTable.lua
@@ -0,0 +1,118 @@
+local ConcatTable, parent = torch.class('nn.ConcatTable', 'nn.Container')
+
+function ConcatTable:__init()
+ parent.__init(self)
+ self.modules = {}
+ self.output = {}
+end
+
+function ConcatTable:updateOutput(input)
+ for i=1,#self.modules do
+ self.output[i] = self:rethrowErrors(self.modules[i], i, 'updateOutput', input)
+ end
+ return self.output
+end
+
+local function retable(t1, t2, f)
+ for k, v in ipairs(t2) do
+ if (torch.type(v) == "table") then
+ t1[k] = retable(t1[k] or {}, t2[k], f)
+ else
+ f(t1, k, v)
+ end
+ end
+ for i=#t2+1, #t1 do
+ t1[i] = nil
+ end
+ return t1
+end
+
+local function backward(self, method, input, gradOutput, scale)
+ local isTable = torch.type(input) == 'table'
+ local wasTable = torch.type(self.gradInput) == 'table'
+ if isTable then
+ for i,module in ipairs(self.modules) do
+ local currentGradInput = self:rethrowErrors(module, i, method, input, gradOutput[i], scale)
+ if torch.type(currentGradInput) ~= 'table' then
+ error"currentGradInput is not a table!"
+ end
+ if #input ~= #currentGradInput then
+ error("table size mismatch: "..#input.." ~= "..#currentGradInput)
+ end
+ if i == 1 then
+ self.gradInput = wasTable and self.gradInput or {}
+ retable(self.gradInput, currentGradInput,
+ function(t, k, v)
+ t[k] = t[k] or v:clone()
+ t[k]:resize(v:size())
+ t[k]:copy(v)
+ end
+ )
+ else
+ retable(self.gradInput, currentGradInput,
+ function(t, k, v)
+ if t[k] then
+ t[k]:add(v)
+ else
+ t[k] = v:clone()
+ end
+ end
+ )
+ end
+ end
+ else
+ self.gradInput = (not wasTable) and self.gradInput or input:clone()
+ for i,module in ipairs(self.modules) do
+ local currentGradInput = self:rethrowErrors(module, i, method, input, gradOutput[i], scale)
+ if i == 1 then
+ self.gradInput:resize(currentGradInput:size()):copy(currentGradInput)
+ else
+ self.gradInput:add(currentGradInput)
+ end
+ end
+ end
+ return self.gradInput
+end
+
+function ConcatTable:updateGradInput(input, gradOutput)
+ return backward(self, 'updateGradInput', input, gradOutput)
+end
+
+function ConcatTable:backward(input, gradOutput, scale)
+ return backward(self, 'backward', input, gradOutput, scale)
+end
+
+function ConcatTable:accGradParameters(input, gradOutput, scale)
+ scale = scale or 1
+ for i,module in ipairs(self.modules) do
+ self:rethrowErrors(module, i, 'accGradParameters', input, gradOutput[i], scale)
+ end
+end
+
+function ConcatTable:accUpdateGradParameters(input, gradOutput, lr)
+ for i,module in ipairs(self.modules) do
+ self:rethrowErrors(module, i, 'accUpdateGradParameters', input, gradOutput[i], lr)
+ end
+end
+
+function ConcatTable:__tostring__()
+ local tab = ' '
+ local line = '\n'
+ local next = ' |`-> '
+ local lastNext = ' `-> '
+ local ext = ' | '
+ local extlast = ' '
+ local last = ' ... -> '
+ local str = torch.type(self)
+ str = str .. ' {' .. line .. tab .. 'input'
+ for i=1,#self.modules do
+ if i == #self.modules then
+ str = str .. line .. tab .. lastNext .. '(' .. i .. '): ' .. tostring(self.modules[i]):gsub(line, line .. tab .. extlast)
+ else
+ str = str .. line .. tab .. next .. '(' .. i .. '): ' .. tostring(self.modules[i]):gsub(line, line .. tab .. ext)
+ end
+ end
+ str = str .. line .. tab .. last .. 'output'
+ str = str .. line .. '}'
+ return str
+end
diff --git a/contrib/lua-torch/nn/Constant.lua b/contrib/lua-torch/nn/Constant.lua
new file mode 100644
index 000000000..07773feb2
--- /dev/null
+++ b/contrib/lua-torch/nn/Constant.lua
@@ -0,0 +1,36 @@
+------------------------------------------------------------------------
+--[[ Constant ]]--
+-- Outputs a constant value given an input.
+-- If nInputDim is specified, uses the input to determine the size of
+-- the batch. The value is then replicated over the batch.
+-- You can use this with nn.ConcatTable() to append constant inputs to
+-- an input: nn.ConcatTable():add(nn.Constant(v)):add(nn.Identity()).
+------------------------------------------------------------------------
+local Constant, parent = torch.class("nn.Constant", "nn.Module")
+
+function Constant:__init(value, nInputDim)
+ self.value = value
+ if torch.type(self.value) == 'number' then
+ self.value = torch.Tensor{self.value}
+ end
+ assert(torch.isTensor(self.value), "Expecting number or tensor at arg 1")
+ self.nInputDim = nInputDim
+ parent.__init(self)
+end
+
+function Constant:updateOutput(input)
+ if self.nInputDim and input:dim() > self.nInputDim then
+ local vsize = self.value:size():totable()
+ self.output:resize(input:size(1), table.unpack(vsize))
+ local value = self.value:view(1, table.unpack(vsize))
+ self.output:copy(value:expand(self.output:size()))
+ else
+ self.output:resize(self.value:size()):copy(self.value)
+ end
+ return self.output
+end
+
+function Constant:updateGradInput(input, gradOutput)
+ self.gradInput:resizeAs(input):zero()
+ return self.gradInput
+end
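+
+-- Usage sketch: with nInputDim set, the constant is replicated over the batch:
+--   local m = nn.Constant(torch.Tensor{1, 2, 3}, 1)
+--   m:forward(torch.randn(10))    -- size 3: the value itself
+--   m:forward(torch.randn(4, 10)) -- size 4x3: one copy of the value per sample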
diff --git a/contrib/lua-torch/nn/Container.lua b/contrib/lua-torch/nn/Container.lua
new file mode 100644
index 000000000..7e264bab9
--- /dev/null
+++ b/contrib/lua-torch/nn/Container.lua
@@ -0,0 +1,149 @@
+-- This is code common to container modules, which are collections of
+-- smaller constituent modules like Parallel, Sequential, etc.
+local Container, parent = torch.class('nn.Container', 'nn.Module')
+
+function Container:__init(...)
+ parent.__init(self, ...)
+ self.modules = {}
+end
+
+function Container:add(module)
+ table.insert(self.modules, module)
+ return self
+end
+
+function Container:get(index)
+ return self.modules[index]
+end
+
+function Container:size()
+ return #self.modules
+end
+
+-- Check if passing arguments through xpcall is supported in this Lua interpreter.
+local _, XPCALL_ARGS = xpcall(function(x) return x ~= nil end, function() end, 1)
+local TRACEBACK_WARNING = "WARNING: If you see a stack trace below, it doesn't point to the place where this error occurred. Please use only the one above."
+-- module argument can be retrieved with moduleIndex, but code is cleaner when
+-- it has to be specified anyway.
+function Container:rethrowErrors(module, moduleIndex, funcName, ...)
+ assert(module == self.modules[moduleIndex],
+ "mismatch between moduleIndex and self.modules in rethrowErrors")
+ local function handleError(err)
+ -- This will be executed only in the first container that handles the error.
+ if not err:find(TRACEBACK_WARNING) then
+ local traceback = debug.traceback()
+ -- Remove this handler from the stack
+ local _, first_line_end = traceback:find('^.-\n')
+ local _, second_line_end = traceback:find('^.-\n.-\n')
+ traceback = traceback:sub(1, first_line_end) .. traceback:sub(second_line_end+1)
+ err = err .. '\n' .. traceback .. '\n\n' .. TRACEBACK_WARNING
+ else
+ -- Remove file path
+ err = err:sub(err:find('\n')+1)
+ end
+ local msg = string.format('In %d module of %s:',
+ moduleIndex, torch.type(self))
+ -- Preceding newline has to be here, because Lua will prepend a file path.
+ err = '\n' .. msg .. '\n' .. err
+ return err
+ end
+
+ -- Lua 5.1 doesn't support passing arguments through xpcall, so they have to
+ -- be passed via a closure. This incurs some overhead, so it's better not to
+ -- make it the default.
+ local ok, ret, noret
+ if not XPCALL_ARGS then
+ local args = {...}
+ local unpack = unpack or table.unpack
+ ok, ret, noret = xpcall(function()
+ return module[funcName](module, unpack(args))
+ end,
+ handleError)
+ else
+ ok, ret, noret = xpcall(module[funcName], handleError, module, ...)
+ end
+ assert(noret == nil, "rethrowErrors supports only one return argument")
+
+ if not ok then error(ret) end
+ return ret
+end
+
+function Container:applyToModules(func)
+ for _, module in ipairs(self.modules) do
+ func(module)
+ end
+end
+
+function Container:zeroGradParameters()
+ self:applyToModules(function(module) module:zeroGradParameters() end)
+end
+
+function Container:updateParameters(learningRate)
+ self:applyToModules(function(module) module:updateParameters(learningRate) end)
+end
+
+function Container:training()
+ self:applyToModules(function(module) module:training() end)
+ parent.training(self)
+end
+
+function Container:evaluate()
+ self:applyToModules(function(module) module:evaluate() end)
+ parent.evaluate(self)
+end
+
+function Container:share(mlp, ...)
+ for i=1,#self.modules do
+ self.modules[i]:share(mlp.modules[i], ...);
+ end
+ return self
+end
+
+function Container:reset(stdv)
+ self:applyToModules(function(module) module:reset(stdv) end)
+end
+
+function Container:parameters()
+ local function tinsert(to, from)
+ if type(from) == 'table' then
+ for i=1,#from do
+ tinsert(to,from[i])
+ end
+ else
+ table.insert(to,from)
+ end
+ end
+ local w = {}
+ local gw = {}
+ for i=1,#self.modules do
+ local mw,mgw = self.modules[i]:parameters()
+ if mw then
+ tinsert(w,mw)
+ tinsert(gw,mgw)
+ end
+ end
+ return w,gw
+end
+
+function Container:clearState()
+ -- don't call set because it might reset referenced tensors
+ local function clear(f)
+ if self[f] then
+ if torch.isTensor(self[f]) then
+ self[f] = self[f].new()
+ elseif type(self[f]) == 'table' then
+ self[f] = {}
+ else
+ self[f] = nil
+ end
+ end
+ end
+ clear('output')
+ clear('gradInput')
+ if self.modules then
+ for i,module in pairs(self.modules) do
+ module:clearState()
+ end
+ end
+ return self
+end
diff --git a/contrib/lua-torch/nn/Contiguous.lua b/contrib/lua-torch/nn/Contiguous.lua
new file mode 100755
index 000000000..f9974ce5a
--- /dev/null
+++ b/contrib/lua-torch/nn/Contiguous.lua
@@ -0,0 +1,21 @@
+local Contiguous, parent = torch.class('nn.Contiguous', 'nn.Module')
+
+function Contiguous:updateOutput(input)
+ if not input:isContiguous() then
+ if self.output:storage() == input:storage() then self.output:set() end
+ self.output:resizeAs(input):copy(input)
+ else
+ self.output:set(input)
+ end
+ return self.output
+end
+
+function Contiguous:updateGradInput(input, gradOutput)
+ if not gradOutput:isContiguous() then
+ if self.gradInput:storage() == gradOutput:storage() then self.gradInput:set() end
+ self.gradInput:resizeAs(gradOutput):copy(gradOutput)
+ else
+ self.gradInput:set(gradOutput)
+ end
+ return self.gradInput
+end
diff --git a/contrib/lua-torch/nn/Convert.lua b/contrib/lua-torch/nn/Convert.lua
new file mode 100644
index 000000000..855338dd6
--- /dev/null
+++ b/contrib/lua-torch/nn/Convert.lua
@@ -0,0 +1,245 @@
+------------------------------------------------------------------------
+--[[ nn.Convert ]]--
+-- Module to convert between different data formats
+-- nn.Convert('bchw', 'bf') or nn.Convert('chw', 'f')
+-- Automatically converts input to same type as self.output
+-- Simplest use is for automatic input type conversions: nn.Convert()
+------------------------------------------------------------------------
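+-- Usage sketch (shapes shown for illustration):
+--   nn.Convert('bchw', 'bf'):forward(torch.randn(2, 3, 4, 5)) -- size 2x60
+--   nn.Convert('bf', 'fb'):forward(torch.randn(2, 10))        -- size 10x2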
+local _ = require 'moses'
+local Convert, parent = torch.class("nn.Convert", "nn.Container")
+
+function Convert:__init(inputShape, outputShape)
+ if outputShape and not inputShape then
+ error"Expecting non-nil arg 1 when arg 2 is provided"
+ end
+ inputShape = inputShape or 'b*'
+ outputShape = outputShape or inputShape
+ self.inputShape = inputShape:find('b') and inputShape or ('b'..inputShape)
+ self.outputShape = outputShape:find('b') and outputShape or ('b'..outputShape)
+ self.inputBatchDim = self.inputShape:find('b')
+ self.outputBatchDim = self.outputShape:find('b')
+ if self.inputShape == 'b*' or self.outputShape == 'b*' then
+ assert(self.inputShape == 'b*' and self.outputShape == 'b*', 'Both or neither shapes must be b*')
+ self.nInputDim = -1
+ self.nOutputDim = -1
+ self.transposition = true
+ else
+ -- number of dims in batch mode
+ self.nInputDim = #self.inputShape
+ self.nOutputDim = #self.outputShape
+ -- is the outputShape just a transposition of the inputShape?
+ if self.nInputDim == self.nOutputDim then
+ self.transposition = true
+ for i=1,self.nInputDim do
+ if not self.outputShape:find(self.inputShape:sub(i,i)) then
+ self.transposition = false
+ break
+ end
+ end
+ end
+ end
+ parent.__init(self)
+end
+
+-- post-initialization
+function Convert:buildConverter(input)
+ if self.transposition then
+ self.converter = self:transpose(self.outputShape)
+ else
+ if (torch.type(self[self.outputShape]) ~= 'function') then
+ error(string.format("Unrecognized conversion of shape %s to %s", self.inputShape, self.outputShape))
+ end
+ self.converter = self[self.outputShape](self, input)
+ end
+ assert(torch.isTensor(self.output), "Expecting Tensor output")
+
+ self.converter:type(torch.type(self.output))
+
+ self.modules[1] = self.converter
+end
+
+function Convert:updateOutput(input)
+ assert(torch.isTensor(input), "expecting Tensor")
+ if not torch.isTypeOf(input, torch.type(self.output)) then
+ -- handle different input type
+ self._input = self._input or self.output.new()
+ self._input:resize(input:size()):copy(input)
+ input = self._input
+ end
+ self.batchMode = true
+ if input:dim() < self.nInputDim then
+ -- handle non-batch mode
+ local inputSize = input:size():totable()
+ table.insert(inputSize, self.inputBatchDim, 1)
+ self.__input = self.__input or input.new()
+ self.__input:set(input):resize(table.unpack(inputSize))
+ input = self.__input
+ self.batchMode = false
+ end
+ if not self.converter then
+ self:buildConverter(input)
+ end
+
+ self.output = self.converter:updateOutput(input)
+
+ if not self.batchMode then
+ local outputSize = self.output:size():totable()
+ table.remove(outputSize, self.outputBatchDim)
+ self.__output = self.__output or self.output.new()
+ self.__output:set(self.output):resize(table.unpack(outputSize))
+ self.output = self.__output
+ end
+ return self.output
+end
+
+function Convert:updateGradInput(input, gradOutput)
+ local input_ = input
+ input = self._input or input
+ if not self.batchMode then
+ input = self.__input
+ self.__gradOutput = self.__gradOutput or gradOutput.new()
+ self.__gradOutput:set(gradOutput):resize(self.converter.output:size())
+ gradOutput = self.__gradOutput
+ end
+
+ local gradInput = self.converter:updateGradInput(input, gradOutput)
+
+ if not self.batchMode then
+ self.__gradInput = self.__gradInput or gradInput.new()
+ self.__gradInput:set(gradInput):resize(input_:size())
+ gradInput = self.__gradInput
+ end
+ if self._input then
+ self._gradInput = self._gradInput or input.new()
+ self._gradInput:resize(input:size()):copy(gradInput)
+ self.gradInput = self._gradInput
+ else
+ self.gradInput = gradInput
+ end
+
+ return self.gradInput
+end
+
+function Convert:accGradParameters(input, gradOutput, scale)
+ input = self.batchMode and self.__input or self._input or input
+ gradOutput = self.batchMode and self.__gradOutput or gradOutput
+ self.converter:accGradParameters(input, gradOutput, scale)
+end
+
+function Convert:accUpdateGradParameters(input, gradOutput, lr)
+ input = self.batchMode and self.__input or self._input or input
+ gradOutput = self.batchMode and self.__gradOutput or gradOutput
+ self.converter:accUpdateGradParameters(input, gradOutput, lr)
+end
+
+-- batch feature
+function Convert:bf(input)
+ local b_pos = self:findAxis('b', self.inputShape)
+ local dim = #self.inputShape
+ if self.inputShape == 'bt' then
+ error"Conversion of shape bt to bf not supported: open an issue on github"
+ end
+ -- was b
+ if dim == 1 then
+ return nn.Reshape(1)
+ end
+ -- was b...
+ local modula
+ if b_pos ~= 1 then
+ modula = nn.Transpose({1, b_pos})
+ end
+ if dim > 2 then
+ local transpose = modula
+ local sampleSize = input:select(self:findAxis('b'),1):nElement()
+ local reshape = nn.Reshape(sampleSize)
+ if transpose then
+ modula = nn.Sequential()
+ modula:add(transpose)
+ modula:add(reshape)
+ else
+ modula = reshape
+ end
+ end
+ return modula or nn.Identity()
+end
+
+-- each example is a scalar; batch is a vector
+function Convert:b(input)
+ local b_pos = self:findAxis('b')
+ if self.inputShape == 'bt' or self.inputShape == 'tb' then
+ local t_pos = self:findAxis('t')
+ -- select first set of classes
+ return nn.Select(t_pos, 1)
+ elseif self.inputShape == 'bf' or self.inputShape == 'fb' then
+      -- this won't work as expected with size(f) > 1
+ local f_pos = self:findAxis('f')
+ if input:size(f_pos) > 1 then
+ error("Cannot convert shape "..self.inputShape.." to b when feature > 1")
+ end
+ return nn.Select(f_pos, 1)
+ else
+ error("Cannot convert shape "..self.inputShape.." to shape b")
+ end
+end
+
+-- returns the current shape of the data
+function Convert:default()
+ return nn.Identity()
+end
+
+-- multi-class (batch target)
+function Convert:bt()
+ local b_pos = self:findAxis('b')
+ local modula
+ if self.inputShape == 'b' then
+ modula = nn.Reshape(1)
+ else
+ error("cannot convert shape '"..self.inputShape.."' to bt")
+ end
+ return modula
+end
+
+-- a generic function for transposing shape axes
+function Convert:transpose(newShape)
+ if newShape == self.inputShape then
+ return nn.Identity()
+ end
+ local inputShape = {}
+ for i=1,#self.inputShape do
+ table.insert(inputShape, self.inputShape:sub(i,i))
+ end
+ local transpositions = {}
+ for i=1,#newShape do
+ local j = _.indexOf(inputShape, newShape:sub(i,i))
+ if i ~= j then
+ local char = inputShape[i]
+ inputShape[i] = inputShape[j]
+ inputShape[j] = char
+ table.insert(transpositions, {j, i})
+ end
+ end
+ return nn.Transpose(table.unpack(transpositions))
+end
+
+function Convert:findAxis(axis_char, shape, silent)
+ shape = shape or self.inputShape
+ local axis_pos = shape:find(axis_char)
+ if (not silent) and (not axis_pos) then
+ error("Provided shape '"..shape.."' has no axis '"..axis_char.."'", 2)
+ end
+ return axis_pos
+end
+
+function Convert:clearState()
+ self._input = nil
+ self._gradInput = nil
+ self.__input = nil
+ self.__output = nil
+ self.__gradInput = nil
+ self.__gradOutput = nil
+end
+
+function Convert:type(type)
+ self:clearState()
+ return parent.type(self, type)
+end
diff --git a/contrib/lua-torch/nn/Copy.lua b/contrib/lua-torch/nn/Copy.lua
new file mode 100644
index 000000000..9f83cf9b4
--- /dev/null
+++ b/contrib/lua-torch/nn/Copy.lua
@@ -0,0 +1,42 @@
+local Copy, parent = torch.class('nn.Copy', 'nn.Module')
+
+function Copy:__init(intype, outtype, forceCopy, dontCast)
+ intype = intype or torch.Tensor.__typename
+ outtype = outtype or torch.Tensor.__typename
+
+ self.dontCast = dontCast
+
+ parent.__init(self)
+ self.gradInput = torch.getmetatable(intype).new()
+ self.output = torch.getmetatable(outtype).new()
+
+ if (not forceCopy) and intype == outtype then
+
+ self.updateOutput = function(self, input)
+ self.output:set(input)
+ return input
+ end
+
+ self.updateGradInput = function(self, input, gradOutput)
+ self.gradInput:set(gradOutput)
+ return gradOutput
+ end
+ end
+end
+
+function Copy:updateOutput(input)
+ self.output:resize(input:size()):copy(input)
+ return self.output
+end
+
+function Copy:updateGradInput(input, gradOutput)
+ self.gradInput:resize(gradOutput:size()):copy(gradOutput)
+ return self.gradInput
+end
+
+function Copy:type(type, tensorCache)
+ if type and self.dontCast then
+ return self
+ end
+ return parent.type(self, type, tensorCache)
+end
diff --git a/contrib/lua-torch/nn/Cosine.lua b/contrib/lua-torch/nn/Cosine.lua
new file mode 100644
index 000000000..19a9cba82
--- /dev/null
+++ b/contrib/lua-torch/nn/Cosine.lua
@@ -0,0 +1,175 @@
+local Cosine, parent = torch.class('nn.Cosine', 'nn.Module')
+
+function Cosine:__init(inputSize,outputSize)
+ parent.__init(self)
+
+ self.weight = torch.Tensor(outputSize,inputSize)
+ self.gradWeight = torch.Tensor(outputSize,inputSize)
+
+ self:reset()
+end
+
+function Cosine:reset(stdv)
+ if stdv then
+ stdv = stdv * math.sqrt(3)
+ else
+ stdv = 1./math.sqrt(self.weight:size(1))
+ end
+ self.weight:uniform(-stdv, stdv)
+end
+
+function Cosine:updateOutput(input)
+ local inputSize = self.weight:size(2)
+ local outputSize = self.weight:size(1)
+
+ self._weightNorm = self._weightNorm or self.weight.new()
+ self._inputNorm = self._inputNorm or self.weight.new()
+
+ -- y_j = (w_j * x) / ( || w_j || * || x || )
+
+ self._weightNorm:norm(self.weight,2,2):add(1e-12)
+ if input:dim() == 1 then
+ self.output:resize(outputSize):zero()
+ self.output:addmv(1, self.weight, input)
+ self.__norm = input:norm()+1e-12
+ self.output:cdiv(self._weightNorm:view(outputSize)):div(self.__norm)
+ elseif input:dim() == 2 then
+ local batchSize = input:size(1)
+ local nElement = self.output:nElement()
+ self.output:resize(batchSize, outputSize)
+ if self.output:nElement() ~= nElement then
+ self.output:zero()
+ end
+ self.output:addmm(0, self.output, 1, input, self.weight:t())
+
+ self._inputNorm:norm(input,2,2):add(1e-12)
+ self.output:cdiv(self._weightNorm:view(1,outputSize):expandAs(self.output))
+ self.output:cdiv(self._inputNorm:expandAs(self.output))
+ else
+ error('input must be vector or matrix')
+ end
+
+ return self.output
+end
+
+function Cosine:updateGradInput(input, gradOutput)
+ if not self.gradInput then
+ return
+ end
+
+ local inputSize = self.weight:size(2)
+ local outputSize = self.weight:size(1)
+
+ --[[
+ dy_j w_ji x_i
+ ---- = ------------------- - y_j ---------
+ dx_i || w_j || * || x || || x ||^2
+ --]]
+
+ local nElement = self.gradInput:nElement()
+ self.gradInput:resizeAs(input)
+ if self.gradInput:nElement() ~= nElement then
+ self.gradInput:zero()
+ end
+
+ if input:dim() == 1 then
+ self._weight = self._weight or input.new()
+ self._weight:resizeAs(self.weight):copy(self.weight)
+ self._weight:cdiv(self._weightNorm:expandAs(self.weight))
+ self._weight:div(self.__norm)
+ self._weight:addr(1, self._weight, -1/(self.__norm*self.__norm), self.output, input)
+ self.gradInput:addmv(0, 1, self._weight:t(), gradOutput)
+ elseif input:dim() == 2 then
+ local inputNorm = self._inputNorm:expandAs(input)
+ local weightNorm = self._weightNorm:view(1,outputSize):expandAs(gradOutput)
+
+ self.gradInput:copy(input):cdiv(inputNorm)
+ self._gradOutput = self._gradOutput or gradOutput.new()
+ self._gradOutput:resizeAs(gradOutput):copy(gradOutput)
+ self._gradOutput:cmul(self.output)
+ self._sum = self._sum or input.new()
+ self._sum:sum(self._gradOutput, 2)
+ self.gradInput:cmul(self._sum:expandAs(input))
+
+ self._gradOutput:resizeAs(gradOutput):copy(gradOutput)
+ self._gradOutput:cdiv(weightNorm)
+ self.gradInput:addmm(-1, self.gradInput, 1, self._gradOutput, self.weight)
+
+ self.gradInput:cdiv(inputNorm)
+ end
+
+ return self.gradInput
+end
+
+function Cosine:accGradParameters(input, gradOutput, scale)
+ scale = scale or 1
+ local inputSize = self.weight:size(2)
+ local outputSize = self.weight:size(1)
+
+ --[[
+ dy_j x_i w_ji
+ ----- = ------------------- - y_j -----------
+ dw_ji || w_j || * || x || || w_j ||^2
+ --]]
+
+ if input:dim() == 1 then
+ self._gradOutput = self._gradOutput or gradOutput.new()
+ self._gradOutput:resizeAs(gradOutput):copy(gradOutput)
+ local weightNorm = self._weightNorm:view(outputSize)
+ self._gradOutput:cdiv(weightNorm)
+ self.gradWeight:addr(scale/self.__norm, self._gradOutput, input)
+
+ self._gradOutput:cdiv(weightNorm)
+ self._gradOutput:cmul(self.output)
+ self._weight = self._weight or self.weight.new()
+      self._weight:resizeAs(self.weight):copy(self.weight)
+ self._weight:cmul(self._gradOutput:view(outputSize, 1):expandAs(self.weight))
+ self.gradWeight:add(-1, self._weight)
+ elseif input:dim() == 2 then
+ self._weight = self._weight or self.weight.new()
+ self._weight:resizeAs(self.weight):copy(self.weight)
+ self._gradOutput = self._gradOutput or gradOutput.new()
+ self._gradOutput:resizeAs(gradOutput):copy(gradOutput)
+ self._gradOutput:cmul(self.output)
+ self._sum = self._sum or input.new()
+ self._sum:sum(self._gradOutput, 1)
+ local grad = self._sum[1]
+ grad:cdiv(self._weightNorm:select(2,1))
+ self._weight:cmul(grad:view(outputSize,1):expandAs(self._weight))
+
+ local input_ = self._gradOutput
+ input_:resizeAs(input):copy(input)
+ input_:cdiv(self._inputNorm:expandAs(input))
+ self._weight:addmm(-1, self._weight, 1, gradOutput:t(), input_)
+
+ self._weight:cdiv(self._weightNorm:expandAs(self._weight))
+ self.gradWeight:add(self._weight)
+ else
+ error"1D or 2D input expected"
+ end
+end
+
+function Cosine:type(type, tensorCache)
+ if type then
+ -- prevent premature memory allocations
+ self._input = nil
+ self._weight = nil
+ self._inputNorm = nil
+ self._weightNorm = nil
+ self._gradOutput = nil
+ self._sum = nil
+ end
+ return parent.type(self, type, tensorCache)
+end
+
+function Cosine:clearState()
+ nn.utils.clear(self, {
+ '_input',
+ '_weight',
+ '_gradOutput',
+ '_sum',
+ '_inputNorm',
+ '_weightNorm',
+ })
+ return parent.clearState(self)
+end
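+
+-- Usage sketch: each output unit is the cosine similarity between the input
+-- and one weight row, so outputs lie in [-1, 1]:
+--   local m = nn.Cosine(10, 5)
+--   m:forward(torch.randn(4, 10)) -- size 4x5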
diff --git a/contrib/lua-torch/nn/CosineDistance.lua b/contrib/lua-torch/nn/CosineDistance.lua
new file mode 100644
index 000000000..fe4e4b9f5
--- /dev/null
+++ b/contrib/lua-torch/nn/CosineDistance.lua
@@ -0,0 +1,116 @@
+local CosineDistance, parent = torch.class('nn.CosineDistance', 'nn.Module')
+
+function CosineDistance:__init()
+ parent.__init(self)
+ self.gradInput = {torch.Tensor(), torch.Tensor()}
+end
+
+local function makeContiguous(self, input1, input2)
+ if not input1:isContiguous() then
+ self._input1 = self._input1 or input1.new()
+ self._input1:resizeAs(input1):copy(input1)
+ input1 = self._input1
+ end
+ if not input2:isContiguous() then
+ self._input2 = self._input2 or input2.new()
+ self._input2:resizeAs(input2):copy(input2)
+ input2 = self._input2
+ end
+ return input1, input2
+end
+
+function CosineDistance:updateOutput(input)
+ local input1, input2 = input[1], input[2]
+
+ input1, input2 = makeContiguous(self, input1, input2)
+
+ if input1:dim() == 1 then
+ input1 = input1:view(1,-1)
+ input2 = input2:view(1,-1)
+ end
+
+ if not self.buffer then
+ self.buffer = input1.new()
+ self.w1 = input1.new()
+ self.w22 = input1.new()
+ self.w = input1.new()
+ self.w32 = input1.new()
+ self.ones = input1.new()
+ end
+
+ self.buffer:cmul(input1,input2)
+ self.w1:sum(self.buffer,2)
+
+ local epsilon = 1e-12
+ self.buffer:cmul(input1,input1)
+ self.w22:sum(self.buffer,2):add(epsilon)
+ self.ones:resizeAs(self.w22):fill(1)
+ self.w22:cdiv(self.ones, self.w22)
+ self.w:resizeAs(self.w22):copy(self.w22)
+
+ self.buffer:cmul(input2,input2)
+ self.w32:sum(self.buffer,2):add(epsilon)
+ self.w32:cdiv(self.ones, self.w32)
+ self.w:cmul(self.w32)
+ self.w:sqrt()
+
+ self.output:cmul(self.w1,self.w)
+ self.output:resize(input1:size(1))
+
+ return self.output
+end
+
+function CosineDistance:updateGradInput(input, gradOutput)
+ local v1 = input[1]
+ local v2 = input[2]
+ local not_batch = false
+
+ v1, v2 = makeContiguous(self, v1, v2)
+
+ if v1:dim() == 1 then
+ v1 = v1:view(1,-1)
+ v2 = v2:view(1,-1)
+ not_batch = true
+ end
+
+ if #self.gradInput ~= 2 then
+ self.gradInput[1] = self.gradInput[1] or v1.new()
+ self.gradInput[2] = self.gradInput[2] or v1.new()
+ end
+
+ local gw1 = self.gradInput[1]
+ local gw2 = self.gradInput[2]
+ gw1:resizeAs(v1):copy(v2)
+ gw2:resizeAs(v1):copy(v1)
+
+ self.buffer:cmul(self.w1,self.w22)
+ gw1:addcmul(-1,self.buffer:expandAs(v1),v1)
+ gw1:cmul(self.w:expandAs(v1))
+
+ self.buffer:cmul(self.w1,self.w32)
+ gw2:addcmul(-1,self.buffer:expandAs(v1),v2)
+ gw2:cmul(self.w:expandAs(v1))
+
+ local go = gradOutput:view(-1,1):expandAs(v1)
+ gw1:cmul(go)
+ gw2:cmul(go)
+
+ if not_batch then
+ self.gradInput[1]:resize(gw1:size(2))
+ self.gradInput[2]:resize(gw2:size(2))
+ end
+
+ return self.gradInput
+end
+
+function CosineDistance:clearState()
+ nn.utils.clear(self, {
+ 'buffer',
+ 'w1',
+ 'w22',
+ 'w',
+ 'w32',
+ 'ones',
+ })
+ return parent.clearState(self)
+end
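+
+-- Usage sketch: takes a table of two tensors and returns their row-wise
+-- cosine similarity:
+--   local d = nn.CosineDistance()
+--   d:forward({torch.randn(4, 10), torch.randn(4, 10)}) -- size 4, in [-1, 1]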
diff --git a/contrib/lua-torch/nn/CosineEmbeddingCriterion.lua b/contrib/lua-torch/nn/CosineEmbeddingCriterion.lua
new file mode 100644
index 000000000..d55e03130
--- /dev/null
+++ b/contrib/lua-torch/nn/CosineEmbeddingCriterion.lua
@@ -0,0 +1,142 @@
+local CosineEmbeddingCriterion, parent = torch.class('nn.CosineEmbeddingCriterion', 'nn.Criterion')
+
+function CosineEmbeddingCriterion:__init(margin)
+ parent.__init(self)
+ margin = margin or 0
+ self.margin = margin
+ self.gradInput = {torch.Tensor(), torch.Tensor()}
+ self.sizeAverage = true
+end
+
+function CosineEmbeddingCriterion:updateOutput(input,y)
+
+ local input1, input2 = input[1], input[2]
+
+ -- keep backward compatibility
+ if type(y) == 'number' then
+ self._y = self._y or input1.new(1)
+ self._y[1] = y
+ y = self._y
+ end
+
+ if input1:dim() == 1 then
+ input1 = input1:view(1,-1)
+ input2 = input2:view(1,-1)
+ end
+
+ if not self.buffer then
+ self.buffer = input1.new()
+ self.w1 = input1.new()
+ self.w22 = input1.new()
+ self.w = input1.new()
+ self.w32 = input1.new()
+ self._outputs = input1.new()
+ -- comparison operators behave differently from cuda/c implementations
+ if input1:type() == 'torch.CudaTensor' then
+ self._idx = input1.new()
+ else
+ self._idx = torch.ByteTensor()
+ end
+ end
+
+ self.buffer:cmul(input1,input2)
+ self.w1:sum(self.buffer,2)
+
+ local epsilon = 1e-12
+ self.buffer:cmul(input1,input1)
+ self.w22:sum(self.buffer,2):add(epsilon)
+ -- self._outputs is also used as a temporary buffer
+ self._outputs:resizeAs(self.w22):fill(1)
+ self.w22:cdiv(self._outputs, self.w22)
+ self.w:resizeAs(self.w22):copy(self.w22)
+
+ self.buffer:cmul(input2,input2)
+ self.w32:sum(self.buffer,2):add(epsilon)
+ self.w32:cdiv(self._outputs, self.w32)
+ self.w:cmul(self.w32)
+ self.w:sqrt()
+
+ self._outputs:cmul(self.w1,self.w)
+ self._outputs = self._outputs:select(2,1)
+
+ y.eq(self._idx,y,-1)
+ self._outputs[self._idx] = self._outputs[self._idx]:add(-self.margin):cmax(0)
+ y.eq(self._idx,y,1)
+ self._outputs[self._idx] = self._outputs[self._idx]:mul(-1):add(1)
+
+ self.output = self._outputs:sum()
+
+ if self.sizeAverage then
+ self.output = self.output/y:size(1)
+ end
+
+ return self.output
+end
+
+function CosineEmbeddingCriterion:updateGradInput(input, y)
+
+ local v1 = input[1]
+ local v2 = input[2]
+ local not_batch = false
+
+ -- keep backward compatibility
+ if type(y) == 'number' then
+      self._y = self._y or v1.new(1)
+ self._y[1] = y
+ y = self._y
+ end
+
+ if v1:dim() == 1 then
+ v1 = v1:view(1,-1)
+ v2 = v2:view(1,-1)
+ not_batch = true
+ end
+
+ local gw1 = self.gradInput[1]
+ local gw2 = self.gradInput[2]
+ gw1:resizeAs(v1):copy(v2)
+ gw2:resizeAs(v1):copy(v1)
+
+ self.buffer:cmul(self.w1,self.w22)
+ gw1:addcmul(-1,self.buffer:expandAs(v1),v1)
+ gw1:cmul(self.w:expandAs(v1))
+
+ self.buffer:cmul(self.w1,self.w32)
+ gw2:addcmul(-1,self.buffer:expandAs(v1),v2)
+ gw2:cmul(self.w:expandAs(v1))
+
+ -- self._idx = self._outputs <= 0
+ y.le(self._idx,self._outputs,0)
+ self._idx = self._idx:view(-1,1):expand(gw1:size())
+ gw1[self._idx] = 0
+ gw2[self._idx] = 0
+
+ y.eq(self._idx,y,1)
+ self._idx = self._idx:view(-1,1):expand(gw2:size())
+ gw1[self._idx] = gw1[self._idx]:mul(-1)
+ gw2[self._idx] = gw2[self._idx]:mul(-1)
+
+ if self.sizeAverage then
+ gw1:div(y:size(1))
+ gw2:div(y:size(1))
+ end
+
+ if not_batch then
+ self.gradInput[1]:resize(gw1:size(2))
+ self.gradInput[2]:resize(gw2:size(2))
+ end
+
+ return self.gradInput
+end
+
+function CosineEmbeddingCriterion:type(type)
+ self._idx = nil
+ parent.type(self,type)
+ -- comparison operators behave differently from cuda/c implementations
+ if type == 'torch.CudaTensor' then
+ self._idx = torch.CudaTensor()
+ else
+ self._idx = torch.ByteTensor()
+ end
+ return self
+end
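+
+-- Usage sketch: y holds +1 for pairs that should be similar and -1 for
+-- pairs whose cosine similarity should not exceed the margin:
+--   local crit = nn.CosineEmbeddingCriterion(0.1)
+--   local loss = crit:forward({torch.randn(4, 10), torch.randn(4, 10)},
+--                             torch.Tensor{1, -1, 1, -1})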
diff --git a/contrib/lua-torch/nn/Criterion.lua b/contrib/lua-torch/nn/Criterion.lua
new file mode 100644
index 000000000..e48f06876
--- /dev/null
+++ b/contrib/lua-torch/nn/Criterion.lua
@@ -0,0 +1,64 @@
+local Criterion = torch.class('nn.Criterion')
+
+function Criterion:__init()
+ self.gradInput = torch.Tensor()
+ self.output = 0
+end
+
+function Criterion:updateOutput(input, target)
+end
+
+function Criterion:forward(input, target)
+ return self:updateOutput(input, target)
+end
+
+function Criterion:backward(input, target)
+ return self:updateGradInput(input, target)
+end
+
+function Criterion:updateGradInput(input, target)
+end
+
+function Criterion:clone()
+ local f = torch.MemoryFile("rw"):binary()
+ f:writeObject(self)
+ f:seek(1)
+ local clone = f:readObject()
+ f:close()
+ return clone
+end
+
+function Criterion:type(type, tensorCache)
+ assert(type, 'Criterion: must provide a type to convert to')
+ -- find all tensors and convert them
+ for key,param in pairs(self) do
+ self[key] = nn.utils.recursiveType(param, type, tensorCache)
+ end
+ return self
+end
+
+function Criterion:float()
+ return self:type('torch.FloatTensor')
+end
+
+function Criterion:double()
+ return self:type('torch.DoubleTensor')
+end
+
+function Criterion:cuda()
+ return self:type('torch.CudaTensor')
+end
+
+function Criterion:cudaHalf()
+ return self:type('torch.CudaHalfTensor')
+end
+
+function Criterion:cudaDouble()
+ return self:type('torch.CudaDoubleTensor')
+end
+
+function Criterion:__call__(input, target)
+ self.output = self:forward(input, target)
+ self.gradInput = self:backward(input, target)
+ return self.output, self.gradInput
+end
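+
+-- The __call__ sugar above runs forward and backward in one step:
+--   local loss, gradInput = criterion(input, target)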
diff --git a/contrib/lua-torch/nn/CriterionTable.lua b/contrib/lua-torch/nn/CriterionTable.lua
new file mode 100644
index 000000000..14f67bd39
--- /dev/null
+++ b/contrib/lua-torch/nn/CriterionTable.lua
@@ -0,0 +1,17 @@
+local CriterionTable, parent = torch.class('nn.CriterionTable', 'nn.Module')
+
+function CriterionTable:__init(criterion)
+ parent.__init(self)
+ self.criterion = criterion
+ self.gradInput = {criterion.gradInput}
+end
+
+function CriterionTable:updateOutput(input)
+ self.output = self.criterion:updateOutput(table.unpack(input))
+ return self.output
+end
+
+function CriterionTable:updateGradInput(input, gradOutput)
+ self.criterion:updateGradInput(table.unpack(input))
+ return self.gradInput
+end
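+
+-- Editor's sketch: wraps a criterion so that it takes a single {input, target}
+-- table, e.g.
+--   local m = nn.CriterionTable(nn.MSECriterion())
+--   local err = m:forward{torch.randn(3), torch.randn(3)}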
diff --git a/contrib/lua-torch/nn/CrossEntropyCriterion.lua b/contrib/lua-torch/nn/CrossEntropyCriterion.lua
new file mode 100644
index 000000000..2f72cf87f
--- /dev/null
+++ b/contrib/lua-torch/nn/CrossEntropyCriterion.lua
@@ -0,0 +1,42 @@
+local CrossEntropyCriterion, Criterion = torch.class('nn.CrossEntropyCriterion', 'nn.Criterion')
+
+function CrossEntropyCriterion:__init(weights, sizeAverage)
+ Criterion.__init(self)
+ self.lsm = nn.LogSoftMax()
+ self.nll = nn.ClassNLLCriterion(weights, sizeAverage)
+ self.sizeAverage = self.nll.sizeAverage
+ self.oldSizeAverage = self.sizeAverage
+end
+
+function CrossEntropyCriterion:updateOutput(input, target)
+ input = input:squeeze()
+ target = type(target) == 'number' and target or target:squeeze()
+ -- only propagate if value has changed to preserve old behavior
+ -- of setting nll.sizeAverage directly
+ if self.sizeAverage ~= self.oldSizeAverage then
+ self.nll.sizeAverage = self.sizeAverage
+ end
+ self.lsm:updateOutput(input)
+ self.nll:updateOutput(self.lsm.output, target)
+ self.output = self.nll.output
+ self.oldSizeAverage = self.sizeAverage
+ return self.output
+end
+
+function CrossEntropyCriterion:updateGradInput(input, target)
+ local size = input:size()
+ input = input:squeeze()
+ target = type(target) == 'number' and target or target:squeeze()
+ -- only propagate if value has changed to preserve old behavior
+ -- of setting nll.sizeAverage directly
+ if self.sizeAverage ~= self.oldSizeAverage then
+ self.nll.sizeAverage = self.sizeAverage
+ end
+ self.nll:updateGradInput(self.lsm.output, target)
+ self.lsm:updateGradInput(input, self.nll.gradInput)
+ self.gradInput:view(self.lsm.gradInput, size)
+ self.oldSizeAverage = self.sizeAverage
+ return self.gradInput
+end
+
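+-- Editor's sketch: equivalent to nn.LogSoftMax followed by
+-- nn.ClassNLLCriterion (sizes are illustrative):
+--   local crit = nn.CrossEntropyCriterion()
+--   local input = torch.randn(4, 10)          -- batch of 4, 10 classes
+--   local target = torch.LongTensor{1, 5, 2, 10}
+--   local loss = crit:forward(input, target)
+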
+return nn.CrossEntropyCriterion
diff --git a/contrib/lua-torch/nn/Decorator.lua b/contrib/lua-torch/nn/Decorator.lua
new file mode 100644
index 000000000..05fb4db92
--- /dev/null
+++ b/contrib/lua-torch/nn/Decorator.lua
@@ -0,0 +1,47 @@
+local Decorator, parent = torch.class("nn.Decorator", "nn.Container")
+
+function Decorator:__init(module)
+ parent.__init(self)
+ -- so that it can be handled like a Container
+ self.modules[1] = module
+end
+
+function Decorator:updateOutput(input)
+ self.output = self.modules[1]:updateOutput(input)
+ return self.output
+end
+
+function Decorator:updateGradInput(input, gradOutput)
+ self.gradInput = self.modules[1]:updateGradInput(input, gradOutput)
+ return self.gradInput
+end
+
+function Decorator:accGradParameters(input, gradOutput, scale)
+ self.modules[1]:accGradParameters(input, gradOutput, scale)
+end
+
+function Decorator:accUpdateGradParameters(input, gradOutput, lr)
+ self.modules[1]:accUpdateGradParameters(input, gradOutput, lr)
+end
+
+function Decorator:sharedAccUpdateGradParameters(input, gradOutput, lr)
+ self.modules[1]:sharedAccUpdateGradParameters(input, gradOutput, lr)
+end
+
+function Decorator:__tostring__()
+ if self.modules[1].__tostring__ then
+ return torch.type(self) .. ' @ ' .. self.modules[1]:__tostring__()
+ else
+ return torch.type(self) .. ' @ ' .. torch.type(self.modules[1])
+ end
+end
+
+-- useful for multiple inheritance
+function Decorator.decorate(class)
+ class.updateOutput = nn.Decorator.updateOutput
+ class.updateGradInput = nn.Decorator.updateGradInput
+ class.accGradParameters = nn.Decorator.accGradParameters
+ class.accUpdateGradParameters = nn.Decorator.accUpdateGradParameters
+ class.sharedAccUpdateGradParameters = nn.Decorator.sharedAccUpdateGradParameters
+ class.__tostring__ = nn.Decorator.__tostring__
+end
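+
+-- Editor's sketch of decorate() for multiple inheritance; the class name is
+-- hypothetical:
+--   local MyDecorator = torch.class('nn.MyDecorator', 'nn.Container')
+--   nn.Decorator.decorate(MyDecorator)
+--   -- nn.MyDecorator instances now forward calls to self.modules[1]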
diff --git a/contrib/lua-torch/nn/DepthConcat.lua b/contrib/lua-torch/nn/DepthConcat.lua
new file mode 100644
index 000000000..f64a90eb8
--- /dev/null
+++ b/contrib/lua-torch/nn/DepthConcat.lua
@@ -0,0 +1,116 @@
+------------------------------------------------------------------------
+--[[ DepthConcat ]]--
+-- Concatenates the outputs of convolution modules along the depth dimension
+-- (nOutputFrame). This is used to implement the DepthConcat layer
+-- of the "Going deeper with convolutions" paper:
+-- http://arxiv.org/pdf/1409.4842v1.pdf
+-- The normal Concat Module can't be used since the spatial dimensions
+-- of tensors to be concatenated may have different values. To deal with
+-- this, we select the largest spatial dimensions and add zero-padding
+-- around the smaller dimensions.
+------------------------------------------------------------------------
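+--
+-- Editor's sketch (layer sizes are illustrative):
+--   local dc = nn.DepthConcat(1)              -- depth dim of a CxHxW input
+--   dc:add(nn.SpatialConvolution(3, 16, 3, 3))
+--   dc:add(nn.SpatialConvolution(3, 16, 5, 5))
+--   -- the smaller 5x5-conv output is zero-padded to the larger spatial
+--   -- size before both are stacked along the depth dimension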
+local DepthConcat, _ = torch.class('nn.DepthConcat', 'nn.Concat')
+
+function DepthConcat:windowNarrow(output, currentOutput, offset)
+ local outputWindow = output:narrow(self.dimension, offset, currentOutput:size(self.dimension))
+ for dim=1,self.outputSize:size(1) do
+ local currentSize = currentOutput:size(dim)
+ if dim ~= self.dimension and self.outputSize[dim] ~= currentSize then
+ -- 5x5 vs 3x3 -> start = [(5-3)/2] + 1 = 2 (1 pad each side)
+ -- 9x9 vs 5x5 -> start = [(9-5)/2] + 1 = 3 (2 pad each side)
+ -- 9x9 vs 4x4 -> start = [(9-4)/2] + 1 = 3.5 (2 pad, 3 pad)
+ local start = math.floor(((self.outputSize[dim] - currentSize) / 2) + 1)
+ outputWindow = outputWindow:narrow(dim, start, currentSize)
+ end
+ end
+ return outputWindow
+end
+
+function DepthConcat:updateOutput(input)
+ self.outputSize = self.outputSize or torch.LongStorage()
+
+ local outs = {}
+ for i=1,#self.modules do
+ local currentOutput = self:rethrowErrors(self.modules[i], i, 'updateOutput', input)
+ outs[i] = currentOutput
+ if i == 1 then
+ self.outputSize:resize(currentOutput:dim()):copy(currentOutput:size())
+ else
+ self.outputSize[self.dimension] = self.outputSize[self.dimension] + currentOutput:size(self.dimension)
+ for dim=1,self.outputSize:size(1) do
+ if dim ~= self.dimension then
+ -- take the maximum size (shouldn't change anything for batch dim)
+ self.outputSize[dim] = math.max(self.outputSize[dim], currentOutput:size(dim))
+ end
+ end
+ end
+ end
+ self.output:resize(self.outputSize):zero() --zero for padding
+
+ local offset = 1
+ for i,module in ipairs(self.modules) do
+ local currentOutput = outs[i]
+ local outputWindow = self:windowNarrow(self.output, currentOutput, offset)
+ outputWindow:copy(currentOutput)
+ offset = offset + currentOutput:size(self.dimension)
+ end
+ return self.output
+end
+
+function DepthConcat:updateGradInput(input, gradOutput)
+ self.gradInput:resizeAs(input)
+
+ local offset = 1
+ for i,module in ipairs(self.modules) do
+ local currentOutput = module.output
+ local gradOutputWindow = self:windowNarrow(gradOutput, currentOutput, offset)
+ local currentGradInput = self:rethrowErrors(module, i, 'updateGradInput', input, gradOutputWindow)
+ if i==1 then
+ self.gradInput:copy(currentGradInput)
+ else
+ self.gradInput:add(currentGradInput)
+ end
+ offset = offset + currentOutput:size(self.dimension)
+ end
+ return self.gradInput
+end
+
+function DepthConcat:accGradParameters(input, gradOutput, scale)
+ scale = scale or 1
+ local offset = 1
+ for i,module in ipairs(self.modules) do
+ local currentOutput = module.output
+ local gradOutputWindow = self:windowNarrow(gradOutput, currentOutput, offset)
+ self:rethrowErrors(module, i, 'accGradParameters', input, gradOutputWindow, scale)
+ offset = offset + currentOutput:size(self.dimension)
+ end
+end
+
+function DepthConcat:backward(input, gradOutput, scale)
+ self.gradInput:resizeAs(input)
+
+ scale = scale or 1
+ local offset = 1
+ for i,module in ipairs(self.modules) do
+ local currentOutput = module.output
+ local gradOutputWindow = self:windowNarrow(gradOutput, currentOutput, offset)
+ local currentGradInput = self:rethrowErrors(module, i, 'backward', input, gradOutputWindow)
+ if i==1 then
+ self.gradInput:copy(currentGradInput)
+ else
+ self.gradInput:add(currentGradInput)
+ end
+ offset = offset + currentOutput:size(self.dimension)
+ end
+ return self.gradInput
+end
+
+function DepthConcat:accUpdateGradParameters(input, gradOutput, lr)
+ local offset = 1
+ for i,module in ipairs(self.modules) do
+ local currentOutput = module.output
+ local gradOutputWindow = self:windowNarrow(gradOutput, currentOutput, offset)
+ self:rethrowErrors(module, i, 'accUpdateGradParameters', input, gradOutputWindow, lr)
+ offset = offset + currentOutput:size(self.dimension)
+ end
+end
diff --git a/contrib/lua-torch/nn/DistKLDivCriterion.lua b/contrib/lua-torch/nn/DistKLDivCriterion.lua
new file mode 100644
index 000000000..bfad57567
--- /dev/null
+++ b/contrib/lua-torch/nn/DistKLDivCriterion.lua
@@ -0,0 +1,34 @@
+local DistKLDivCriterion, parent = torch.class('nn.DistKLDivCriterion', 'nn.Criterion')
+
+function DistKLDivCriterion:__init()
+ parent.__init(self)
+ self.sizeAverage = true
+end
+
+function DistKLDivCriterion:updateOutput(input, target)
+ assert(input:dim() == target:dim() and
+ torch.LongTensor(input:size()):eq(torch.LongTensor(target:size())):all(),
+ 'input and target should have the same size')
+ self.output_tensor = self.output_tensor or input.new(1)
+ input.THNN.DistKLDivCriterion_updateOutput(
+ input:cdata(),
+ target:cdata(),
+ self.output_tensor:cdata(),
+ self.sizeAverage
+ )
+ self.output = self.output_tensor[1]
+ return self.output
+end
+
+function DistKLDivCriterion:updateGradInput(input, target)
+ assert(input:dim() == target:dim() and
+ torch.LongTensor(input:size()):eq(torch.LongTensor(target:size())):all(),
+ 'input and target should have the same size')
+ input.THNN.DistKLDivCriterion_updateGradInput(
+ input:cdata(),
+ target:cdata(),
+ self.gradInput:cdata(),
+ self.sizeAverage
+ )
+ return self.gradInput
+end
diff --git a/contrib/lua-torch/nn/DistanceRatioCriterion.lua b/contrib/lua-torch/nn/DistanceRatioCriterion.lua
new file mode 100644
index 000000000..6b79d0620
--- /dev/null
+++ b/contrib/lua-torch/nn/DistanceRatioCriterion.lua
@@ -0,0 +1,142 @@
+--[[
+   Probabilistic criterion for a triplet Siamese model for learning embeddings.
+ Ref: https://arxiv.org/pdf/1610.00243.pdf
+
+ loss = -log( exp(-X) / ( exp(-X) + exp(-Y) ) )
+ where
+ X : Distance between similar samples
+ Y : Distance between dissimilar samples
+
+   The loss can be broken down into the following log expansion
+
+ loss = -log( exp(-X) ) - (-log( exp(-X) + exp(-Y) ))
+ = -log( exp(-X) ) + log( exp(-X) + exp(-Y) )
+ = -(-X) + log( exp(-X) + exp(-Y) )
+ = X + log( exp(-X) + exp(-Y) )
+
+ Gradients:
+ dLoss/dX = 1 + 1 / (exp(-X) + exp(-Y)) * -1 * exp(-X)
+ = 1 - exp(-X) / (exp(-X) + exp(-Y))
+
+ dLoss/dY = 0 + 1 / (exp(-X) + exp(-Y)) * -1 * exp(-Y)
+ = -exp(-Y) / (exp(-X) + exp(-Y))
+
+--]]
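+
+-- Editor's sketch (names are illustrative): X and Y would typically be the
+-- two distance outputs of a triplet network.
+--   local crit = nn.DistanceRatioCriterion(true)
+--   local X = torch.rand(8) -- distances between similar pairs
+--   local Y = torch.rand(8) -- distances between dissimilar pairs
+--   local loss = crit:forward({X, Y})
+--   local gradX, gradY = unpack(crit:backward({X, Y}))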
+
+local DistanceRatioCriterion, parent = torch.class('nn.DistanceRatioCriterion',
+ 'nn.Criterion')
+
+function DistanceRatioCriterion:__init(sizeAverage)
+ parent.__init(self)
+ if sizeAverage ~= nil then
+ self.sizeAverage = sizeAverage
+ else
+ self.sizeAverage = true
+ end
+end
+
+-- Forward
+--[[
+-- X : Distance between similar samples
+-- Y : Distance between dissimilar samples
+ loss = -log( exp(-X) ) - (-log( exp(-X) + exp(-Y) ))
+ = -log( exp(-X) ) + log( exp(-X) + exp(-Y) )
+ = -(-X) + log( exp(-X) + exp(-Y) )
+ = X + log( exp(-X) + exp(-Y) )
+--]]
+function DistanceRatioCriterion:updateOutput(input)
+ assert(#input == 2, "Invalid number of inputs")
+
+ local X = input[1]
+ local Y = input[2]
+
+   assert(X:nElement() == Y:nElement(), "Number of distances doesn't match.")
+ assert(X:size(1) == Y:size(1), "Invalid distances' size.")
+
+ -- Compute exp(-X) and exp(-Y)
+ self._expMinusX = self._expMinusX or X.new()
+ self._expMinusY = self._expMinusY or Y.new()
+
+ -- Compute ( exp(-X) + exp(-Y) )
+ self._expMinusX:resizeAs(X):copy(X):mul(-1):exp()
+ self._expMinusY:resizeAs(Y):copy(Y):mul(-1):exp()
+
+   self._sumExpMinusXY = self._sumExpMinusXY or X.new()
+ self._sumExpMinusXY:resizeAs(self._expMinusX):copy(self._expMinusX)
+ :add(self._expMinusY)
+
+ -- Compute log( exp(-X) + exp(-Y) )
+ self._logSumExpMinusXY = self._logSumExpMinusXY or self._sumExpMinusXY.new()
+ self._logSumExpMinusXY:resizeAs(self._sumExpMinusXY)
+ :copy(self._sumExpMinusXY):log()
+
+   -- Compute X + log( exp(-X) + exp(-Y) )
+ self.loss = self.loss or self._logSumExpMinusXY.new()
+ self.loss:resizeAs(X):copy(X):add(self._logSumExpMinusXY)
+
+ if self.sizeAverage then
+ return self.loss:sum()/X:size(1)
+ else
+ return self.loss:sum()
+ end
+end
+
+-- Backward
+--[[
+-- X : Distance between similar samples
+-- Y : Distance between dissimilar samples
+
+ Gradients:
+ dLoss/dX = 1 + 1 / (exp(-X) + exp(-Y)) * -1 * exp(-X)
+ = 1 - exp(-X) / (exp(-X) + exp(-Y))
+
+ dLoss/dY = 0 + 1 / (exp(-X) + exp(-Y)) * -1 * exp(-Y)
+ = -exp(-Y) / (exp(-X) + exp(-Y))
+
+--]]
+function DistanceRatioCriterion:updateGradInput(input)
+ assert(#input == 2, "Invalid number of inputs")
+ local X = input[1]
+ local Y = input[2]
+   assert(X:nElement() == Y:nElement(), "Number of distances doesn't match.")
+ assert(X:size(1) == Y:size(1), "Invalid distances' size.")
+
+ -- dLoss/dX
+ -- -exp(-X)
+ self.dX = self.dX or X.new()
+ self.dX:resizeAs(self._expMinusX):copy(self._expMinusX):mul(-1)
+
+ -- -exp(-X) / (exp(-X) + exp(-Y))
+ self.dX:cdiv(self._sumExpMinusXY)
+
+ -- 1 - exp(-X) / (exp(-X) + exp(-Y))
+ self.dX:add(1)
+
+ -- dLoss/dY
+ -- -exp(-Y)
+ self.dY = self.dY or Y.new()
+ self.dY:resizeAs(self._expMinusY):copy(self._expMinusY):mul(-1)
+
+ -- -exp(-Y) / (exp(-X) + exp(-Y))
+ self.dY:cdiv(self._sumExpMinusXY)
+
+ if self.sizeAverage then
+ self.dX:div(X:size(1))
+ self.dY:div(X:size(1))
+ end
+
+ return {self.dX, self.dY}
+end
+
+function DistanceRatioCriterion:type(type, tensorCache)
+ if type then
+ self._expMinusX = nil
+ self._expMinusY = nil
+ self._sumExpMinusXY = nil
+ self._logSumExpMinusXY = nil
+ self.loss = nil
+ self.dX = nil
+ self.dY = nil
+ end
+ return parent.type(self, type, tensorCache)
+end
diff --git a/contrib/lua-torch/nn/DontCast.lua b/contrib/lua-torch/nn/DontCast.lua
new file mode 100644
index 000000000..b89f5436b
--- /dev/null
+++ b/contrib/lua-torch/nn/DontCast.lua
@@ -0,0 +1,124 @@
+local DontCast, parent = torch.class("nn.DontCast", "nn.Decorator")
+
+-- utility functions
+
+local function recursiveTypeCopy(dst, src, type_str)
+ if torch.type(src) == 'table' then
+ dst = (torch.type(dst) == 'table') and dst or {}
+ for k, v in pairs(src) do
+ dst[k] = recursiveTypeCopy(dst[k], v, type_str)
+ end
+ elseif torch.isTensor(src) then
+ dst = (torch.type(dst) == type_str) and dst or torch.getmetatable(type_str).new()
+ dst:resize(src:size())
+ if src:nElement() > 0 then
+ dst:copy(src)
+ end
+ end
+ return dst
+end
+
+local function tableTensorType(src)
+ if type(src) == 'table' then
+ local type_str, found
+ for k,v in pairs(src) do
+ type_str, found = tableTensorType(v)
+ if found then
+ return type_str, true
+ end
+ end
+ return type_str, found
+ else
+ return torch.type(src), torch.isTensor(src)
+ end
+end
+
+-- DontCast methods and constructor
+
+function DontCast:__init(module, castin, castout, moduleType)
+ parent.__init(self, module)
+ self.castin = castin
+ self.castout = (castout == nil) and castin or castout
+ self.moduleType = moduleType
+ if (self.castin or self.castout) and not self.moduleType then
+ local moduleType, found = tableTensorType(module.output)
+ if found then
+ self.moduleType = moduleType
+ else
+ moduleType, found = tableTensorType(module:parameters())
+ if found then
+ self.moduleType = moduleType
+ else
+ error"Cannot extrapolate moduleType. Provide constructor argument 4"
+ end
+ end
+ end
+end
+
+function DontCast:updateOutput(input)
+ if self.castin and tableTensorType(input) ~= self.moduleType then
+ self._input = recursiveTypeCopy(self._input, input, self.moduleType)
+ input = self._input
+ end
+
+ local output = self.modules[1]:updateOutput(input)
+
+ if self.castout then
+ self.output = recursiveTypeCopy(self.output, output, tableTensorType(self.output))
+ else
+ self.output = output
+ end
+ return self.output
+end
+
+function DontCast:updateGradInput(input, gradOutput)
+ if self.castin and tableTensorType(input) ~= self.moduleType then
+ input = self._input
+ end
+ if self.castout and tableTensorType(gradOutput) ~= self.moduleType then
+ self._gradOutput = recursiveTypeCopy(self._gradOutput, gradOutput, self.moduleType)
+ gradOutput = self._gradOutput
+ end
+
+ local gradInput = self.modules[1]:updateGradInput(input, gradOutput)
+
+ if self.castin then
+ self.gradInput = recursiveTypeCopy(self.gradInput, gradInput, tableTensorType(self.gradInput))
+ else
+ self.gradInput = gradInput
+ end
+ return self.gradInput
+end
+
+function DontCast:accGradParameters(input, gradOutput, scale)
+ if self.castin and tableTensorType(input) ~= self.moduleType then
+ input = self._input
+ end
+ if self.castout and tableTensorType(gradOutput) ~= self.moduleType then
+ gradOutput = self._gradOutput
+ end
+
+ self.modules[1]:accGradParameters(input, gradOutput, scale)
+end
+
+function DontCast:accUpdateGradParameters(input, gradOutput, lr)
+ if self.castin and tableTensorType(input) ~= self.moduleType then
+ input = self._input
+ end
+ if self.castout and tableTensorType(gradOutput) ~= self.moduleType then
+ gradOutput = self._gradOutput
+ end
+
+ self.modules[1]:accUpdateGradParameters(input, gradOutput, lr)
+end
+
+-- dont cast (the essence thereof)
+function DontCast:type(type)
+ if self.castout and tableTensorType(self.output) ~= type then
+ self.output = recursiveTypeCopy(nil, self.output, type)
+ end
+ if self.castin and tableTensorType(self.gradInput) ~= type then
+ self.gradInput = recursiveTypeCopy(nil, self.gradInput, type)
+ end
+ return self
+end
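+
+-- Editor's sketch: keep the inner module in float while the rest of the
+-- network uses another type (sizes are illustrative):
+--   local dc = nn.DontCast(nn.Linear(3, 4):float(), true, true, 'torch.FloatTensor')
+--   dc:double()                                  -- the inner Linear stays float
+--   local out = dc:forward(torch.randn(2, 3))    -- double in, double out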
diff --git a/contrib/lua-torch/nn/DotProduct.lua b/contrib/lua-torch/nn/DotProduct.lua
new file mode 100644
index 000000000..ccd347e6b
--- /dev/null
+++ b/contrib/lua-torch/nn/DotProduct.lua
@@ -0,0 +1,61 @@
+local DotProduct, parent = torch.class('nn.DotProduct', 'nn.Module')
+
+function DotProduct:__init()
+ parent.__init(self)
+ self.gradInput = {torch.Tensor(), torch.Tensor()}
+end
+
+function DotProduct:updateOutput(input)
+ local input1, input2 = input[1], input[2]
+ if input1:dim() == 1 then
+ -- convert non batch input to batch input
+ input1 = input1:view(1,-1)
+ input2 = input2:view(1,-1)
+ end
+ if not self.buffer then
+ self.buffer = input1.new()
+ end
+ self.buffer:cmul(input1, input2)
+ self.output:sum(self.buffer, 2)
+ self.output:resize(input1:size(1))
+ return self.output
+end
+
+function DotProduct:updateGradInput(input, gradOutput)
+ local v1 = input[1]
+ local v2 = input[2]
+ local not_batch = false
+
+ if #self.gradInput ~= 2 then
+ self.gradInput[1] = self.gradInput[1] or input[1].new()
+ self.gradInput[2] = self.gradInput[2] or input[2].new()
+ end
+
+ if v1:dim() == 1 then
+ v1 = v1:view(1,-1)
+ v2 = v2:view(1,-1)
+ not_batch = true
+ end
+
+ local gw1 = self.gradInput[1]
+ local gw2 = self.gradInput[2]
+ gw1:resizeAs(v1):copy(v2)
+ gw2:resizeAs(v2):copy(v1)
+
+ local go = gradOutput:view(-1,1):expandAs(v1)
+ gw1:cmul(go)
+ gw2:cmul(go)
+
+ if not_batch then
+ -- unbatch gradInput
+ self.gradInput[1]:set(gw1:select(1,1))
+ self.gradInput[2]:set(gw2:select(1,1))
+ end
+
+ return self.gradInput
+end
+
+function DotProduct:clearState()
+ if self.buffer then self.buffer:set() end
+ return parent.clearState(self)
+end
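+
+-- Editor's sketch: row-wise dot product of two batches (sizes illustrative):
+--   local dot = nn.DotProduct()
+--   local a, b = torch.randn(4, 5), torch.randn(4, 5)
+--   local out = dot:forward{a, b} -- tensor of 4 dot products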
diff --git a/contrib/lua-torch/nn/Dropout.lua b/contrib/lua-torch/nn/Dropout.lua
new file mode 100644
index 000000000..15f2f4699
--- /dev/null
+++ b/contrib/lua-torch/nn/Dropout.lua
@@ -0,0 +1,70 @@
+local Dropout, Parent = torch.class('nn.Dropout', 'nn.Module')
+
+function Dropout:__init(p,v1,inplace,stochasticInference)
+ Parent.__init(self)
+ self.p = p or 0.5
+ self.train = true
+ self.inplace = inplace
+ self.stochastic_inference = stochasticInference or false
+ -- version 2 scales output during training instead of evaluation
+ self.v2 = not v1
+ if self.p >= 1 or self.p < 0 then
+ error('<Dropout> illegal percentage, must be 0 <= p < 1')
+ end
+ self.noise = torch.Tensor()
+end
+
+function Dropout:updateOutput(input)
+ if self.inplace then
+ self.output:set(input)
+ else
+ self.output:resizeAs(input):copy(input)
+ end
+ if self.p > 0 then
+ if self.train or self.stochastic_inference then
+ self.noise:resizeAs(input)
+ self.noise:bernoulli(1-self.p)
+ if self.v2 then
+ self.noise:div(1-self.p)
+ end
+ self.output:cmul(self.noise)
+ elseif not self.v2 then
+ self.output:mul(1-self.p)
+ end
+ end
+ return self.output
+end
+
+function Dropout:updateGradInput(input, gradOutput)
+ if self.inplace then
+ self.gradInput:set(gradOutput)
+ else
+ self.gradInput:resizeAs(gradOutput):copy(gradOutput)
+ end
+ if self.train then
+ if self.p > 0 then
+ self.gradInput:cmul(self.noise) -- simply mask the gradients with the noise vector
+ end
+ else
+ if not self.v2 and self.p > 0 then
+ self.gradInput:mul(1-self.p)
+ end
+ end
+ return self.gradInput
+end
+
+function Dropout:setp(p)
+ self.p = p
+end
+
+function Dropout:__tostring__()
+ return string.format('%s(%f)', torch.type(self), self.p)
+end
+
+
+function Dropout:clearState()
+ if self.noise then
+ self.noise:set()
+ end
+ return Parent.clearState(self)
+end
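+
+-- Editor's sketch: with the default v2 behaviour, scaling happens during
+-- training and evaluation is a plain copy:
+--   local drop = nn.Dropout(0.4)
+--   drop:training()
+--   local y1 = drop:forward(torch.ones(5)) -- ~60% of entries kept, scaled by 1/0.6
+--   drop:evaluate()
+--   local y2 = drop:forward(torch.ones(5)) -- values pass through unchanged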
diff --git a/contrib/lua-torch/nn/ELU.lua b/contrib/lua-torch/nn/ELU.lua
new file mode 100644
index 000000000..48a6caa2c
--- /dev/null
+++ b/contrib/lua-torch/nn/ELU.lua
@@ -0,0 +1,45 @@
+local ELU, parent = torch.class('nn.ELU', 'nn.Module')
+
+--[[
+ Djork-Arné Clevert, Thomas Unterthiner, Sepp Hochreiter
+ Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)
+ http://arxiv.org/pdf/1511.07289.pdf
+--]]
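+
+-- Editor's note: elementwise, f(x) = x for x > 0 and alpha * (exp(x) - 1)
+-- for x <= 0.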
+
+function ELU:__init(alpha, inplace)
+ parent.__init(self)
+ self.alpha = alpha or 1
+ assert(type(self.alpha) == 'number')
+ self.inplace = inplace or false
+ assert(type(self.inplace) == 'boolean')
+end
+
+function ELU:updateOutput(input)
+ local inplace = self.inplace or false
+
+ input.THNN.ELU_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ self.alpha,
+ inplace
+ )
+ return self.output
+end
+
+function ELU:updateGradInput(input, gradOutput)
+ local inplace = self.inplace or false
+
+ input.THNN.ELU_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.output:cdata(),
+ self.alpha,
+ inplace
+ )
+ return self.gradInput
+end
+
+function ELU:__tostring__()
+ return string.format('%s (alpha:%f)', torch.type(self), self.alpha)
+end
diff --git a/contrib/lua-torch/nn/ErrorMessages.lua b/contrib/lua-torch/nn/ErrorMessages.lua
new file mode 100644
index 000000000..a5cbed053
--- /dev/null
+++ b/contrib/lua-torch/nn/ErrorMessages.lua
@@ -0,0 +1,19 @@
+
+local mt = {
+ __index = function(table, key)
+ error("nn."..key.." is only supported for Float or Double Tensors.")
+ end
+}
+
+local tensors = {
+ torch.ByteTensor,
+ torch.CharTensor,
+ torch.ShortTensor,
+ torch.IntTensor,
+ torch.LongTensor,
+}
+
+for _, t in ipairs(tensors) do
+ t.nn = {}
+ setmetatable(t.nn, mt)
+end
diff --git a/contrib/lua-torch/nn/Euclidean.lua b/contrib/lua-torch/nn/Euclidean.lua
new file mode 100644
index 000000000..509feff50
--- /dev/null
+++ b/contrib/lua-torch/nn/Euclidean.lua
@@ -0,0 +1,197 @@
+local Euclidean, parent = torch.class('nn.Euclidean', 'nn.Module')
+
+function Euclidean:__init(inputSize,outputSize)
+ parent.__init(self)
+
+ self.weight = torch.Tensor(inputSize,outputSize)
+ self.gradWeight = torch.Tensor(inputSize,outputSize)
+
+ -- state
+ self.gradInput:resize(inputSize)
+ self.output:resize(outputSize)
+
+ self.fastBackward = true
+
+ self:reset()
+end
+
+function Euclidean:reset(stdv)
+ if stdv then
+ stdv = stdv * math.sqrt(3)
+ else
+ stdv = 1./math.sqrt(self.weight:size(1))
+ end
+ if nn.oldSeed then
+ for i=1,self.weight:size(2) do
+ self.weight:select(2, i):apply(function()
+ return torch.uniform(-stdv, stdv)
+ end)
+ end
+ else
+ self.weight:uniform(-stdv, stdv)
+ end
+end
+
+local function view(res, src, ...)
+ local args = {...}
+ if src:isContiguous() then
+ res:view(src, table.unpack(args))
+ else
+ res:reshape(src, table.unpack(args))
+ end
+end
+
+function Euclidean:updateOutput(input)
+ -- lazy initialize buffers
+ self._input = self._input or input.new()
+ self._weight = self._weight or self.weight.new()
+ self._expand = self._expand or self.output.new()
+ self._expand2 = self._expand2 or self.output.new()
+ self._repeat = self._repeat or self.output.new()
+ self._repeat2 = self._repeat2 or self.output.new()
+
+ local inputSize, outputSize = self.weight:size(1), self.weight:size(2)
+
+ -- y_j = || w_j - x || = || x - w_j ||
+ if input:dim() == 1 then
+ view(self._input, input, inputSize, 1)
+ self._expand:expandAs(self._input, self.weight)
+ self._repeat:resizeAs(self._expand):copy(self._expand)
+ self._repeat:add(-1, self.weight)
+ self.output:norm(self._repeat, 2, 1)
+ self.output:resize(outputSize)
+ elseif input:dim() == 2 then
+ local batchSize = input:size(1)
+
+ view(self._input, input, batchSize, inputSize, 1)
+ self._expand:expand(self._input, batchSize, inputSize, outputSize)
+ -- make the expanded tensor contiguous (requires lots of memory)
+ self._repeat:resizeAs(self._expand):copy(self._expand)
+
+ self._weight:view(self.weight, 1, inputSize, outputSize)
+ self._expand2:expandAs(self._weight, self._repeat)
+
+ if torch.type(input) == 'torch.CudaTensor' then
+ -- requires lots of memory, but minimizes cudaMallocs and loops
+ self._repeat2:resizeAs(self._expand2):copy(self._expand2)
+ self._repeat:add(-1, self._repeat2)
+ else
+ self._repeat:add(-1, self._expand2)
+ end
+
+ self.output:norm(self._repeat, 2, 2)
+ self.output:resize(batchSize, outputSize)
+ else
+ error"1D or 2D input expected"
+ end
+
+ return self.output
+end
+
+function Euclidean:updateGradInput(input, gradOutput)
+ if not self.gradInput then
+ return
+ end
+
+ self._div = self._div or input.new()
+ self._output = self._output or self.output.new()
+ self._gradOutput = self._gradOutput or input.new()
+ self._expand3 = self._expand3 or input.new()
+
+ if not self.fastBackward then
+ self:updateOutput(input)
+ end
+
+ local inputSize, outputSize = self.weight:size(1), self.weight:size(2)
+
+ --[[
+ dy_j -2 * (w_j - x) x - w_j
+ ---- = --------------- = -------
+ dx 2 || w_j - x || y_j
+ --]]
+
+ -- to prevent div by zero (NaN) bugs
+ self._output:resizeAs(self.output):copy(self.output):add(0.0000001)
+ view(self._gradOutput, gradOutput, gradOutput:size())
+ self._div:cdiv(gradOutput, self._output)
+ if input:dim() == 1 then
+ self._div:resize(1, outputSize)
+ self._expand3:expandAs(self._div, self.weight)
+
+ if torch.type(input) == 'torch.CudaTensor' then
+ self._repeat2:resizeAs(self._expand3):copy(self._expand3)
+ self._repeat2:cmul(self._repeat)
+ else
+ self._repeat2:cmul(self._repeat, self._expand3)
+ end
+
+ self.gradInput:sum(self._repeat2, 2)
+ self.gradInput:resizeAs(input)
+ elseif input:dim() == 2 then
+ local batchSize = input:size(1)
+
+ self._div:resize(batchSize, 1, outputSize)
+ self._expand3:expand(self._div, batchSize, inputSize, outputSize)
+
+ if torch.type(input) == 'torch.CudaTensor' then
+ self._repeat2:resizeAs(self._expand3):copy(self._expand3)
+ self._repeat2:cmul(self._repeat)
+ else
+ self._repeat2:cmul(self._repeat, self._expand3)
+ end
+
+ self.gradInput:sum(self._repeat2, 3)
+ self.gradInput:resizeAs(input)
+ else
+ error"1D or 2D input expected"
+ end
+
+ return self.gradInput
+end
+
+function Euclidean:accGradParameters(input, gradOutput, scale)
+ local inputSize, outputSize = self.weight:size(1), self.weight:size(2)
+ scale = scale or 1
+
+ --[[
+ dy_j 2 * (w_j - x) w_j - x
+ ---- = --------------- = -------
+ dw_j 2 || w_j - x || y_j
+ --]]
+ -- assumes a preceding call to updateGradInput
+ if input:dim() == 1 then
+ self.gradWeight:add(-scale, self._repeat2)
+ elseif input:dim() == 2 then
+ self._sum = self._sum or input.new()
+ self._sum:sum(self._repeat2, 1)
+ self._sum:resize(inputSize, outputSize)
+ self.gradWeight:add(-scale, self._sum)
+ else
+ error"1D or 2D input expected"
+ end
+end
+
+function Euclidean:type(type, tensorCache)
+ if type then
+ -- prevent premature memory allocations
+ self:clearState()
+ end
+ return parent.type(self, type, tensorCache)
+end
+
+function Euclidean:clearState()
+ nn.utils.clear(self, {
+ '_input',
+ '_output',
+ '_gradOutput',
+ '_weight',
+ '_div',
+ '_sum',
+ '_expand',
+ '_expand2',
+ '_expand3',
+ '_repeat',
+ '_repeat2',
+ })
+ return parent.clearState(self)
+end
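+
+-- Editor's sketch: each output j is the distance || w_j - x || (sizes are
+-- illustrative):
+--   local e = nn.Euclidean(5, 3)
+--   local out = e:forward(torch.randn(5)) -- 3 distances, one per weight column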
diff --git a/contrib/lua-torch/nn/Exp.lua b/contrib/lua-torch/nn/Exp.lua
new file mode 100644
index 000000000..f41569026
--- /dev/null
+++ b/contrib/lua-torch/nn/Exp.lua
@@ -0,0 +1,9 @@
+local Exp = torch.class('nn.Exp', 'nn.Module')
+
+function Exp:updateOutput(input)
+ return self.output:exp(input)
+end
+
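+-- Since d/dx exp(x) = exp(x), the gradient reuses the cached forward output.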
+function Exp:updateGradInput(input, gradOutput)
+ return self.gradInput:cmul(self.output, gradOutput)
+end
diff --git a/contrib/lua-torch/nn/FlattenTable.lua b/contrib/lua-torch/nn/FlattenTable.lua
new file mode 100644
index 000000000..1c182557c
--- /dev/null
+++ b/contrib/lua-torch/nn/FlattenTable.lua
@@ -0,0 +1,106 @@
+local FlattenTable, parent = torch.class('nn.FlattenTable', 'nn.Module')
+
+function FlattenTable:__init()
+ parent.__init(self)
+
+ self.output = {}
+ self.input_map = {}
+ self.gradInput = {}
+end
+
+-- Recursive function to flatten a table (output is a table)
+local function flatten(output, input)
+ local input_map -- has the same structure as input, but stores the
+ -- indices to the corresponding output
+ if type(input) == 'table' then
+ input_map = {}
+ -- forward DFS order
+ for i = 1, #input do
+ input_map[#input_map+1] = flatten(output, input[i])
+ end
+ else
+ input_map = #output + 1
+ output[input_map] = input -- append the tensor
+ end
+ return input_map
+end
+
+-- Recursive function to check if we need to rebuild the output table
+local function checkMapping(output, input, input_map)
+ if input_map == nil or output == nil or input == nil then
+ return false
+ end
+ if type(input) == 'table' then
+ if type(input_map) ~= 'table' then
+ return false
+ end
+ if #input ~= #input_map then
+ return false
+ end
+ -- forward DFS order
+ for i = 1, #input do
+ local ok = checkMapping(output, input[i], input_map[i])
+ if not ok then
+ return false
+ end
+ end
+ return true
+ else
+ if type(input_map) ~= 'number' then
+ return false
+ end
+ return output[input_map] == input
+ end
+end
+
+-- During BPROP we have to build a gradInput with the same shape as the
+-- input. This is a recursive function to build up a gradInput
+local function inverseFlatten(gradOutput, input_map)
+ if type(input_map) == 'table' then
+ local gradInput = {}
+ for i = 1, #input_map do
+ gradInput[#gradInput + 1] = inverseFlatten(gradOutput, input_map[i])
+ end
+ return gradInput
+ else
+ return gradOutput[input_map]
+ end
+end
+
+function FlattenTable:updateOutput(input)
+ assert(type(input) == 'table', 'input must be a table')
+   -- to avoid rebuilding the flattened table on every updateOutput call
+ -- we will do a DFS pass over the existing output table and the inputs to
+ -- see if it needs to be rebuilt.
+ if not checkMapping(self.output, input, self.input_map) then
+ self.output = {}
+ self.input_map = flatten(self.output, input)
+ end
+ return self.output
+end
+
+function FlattenTable:updateGradInput(input, gradOutput)
+ assert(type(input) == 'table', 'input must be a table')
+   assert(type(gradOutput) == 'table', 'gradOutput must be a table')
+ -- If the input changes between the updateOutput and updateGradInput call,
+ -- then we may have to rebuild the input_map! However, let's assume that
+ -- the input_map is valid and that forward has already been called.
+
+ -- However, we should check that the gradInput is valid:
+ if not checkMapping(gradOutput, self.gradInput, self.input_map) then
+ self.gradInput = inverseFlatten(gradOutput, self.input_map)
+ end
+
+ return self.gradInput
+end
+
+function FlattenTable:type(type, tensorCache)
+ -- This function just stores references so we don't need to do any type
+ -- conversions. Just force the tables to be empty.
+ self:clearState()
+end
+
+function FlattenTable:clearState()
+ self.input_map = {}
+ return parent.clearState(self)
+end
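+
+-- Editor's sketch: nested tables of tensors are flattened in DFS order:
+--   local ft = nn.FlattenTable()
+--   local t1, t2, t3 = torch.rand(2), torch.rand(2), torch.rand(2)
+--   local out = ft:forward{t1, {t2, {t3}}} -- {t1, t2, t3}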
diff --git a/contrib/lua-torch/nn/GPU.lua b/contrib/lua-torch/nn/GPU.lua
new file mode 100644
index 000000000..758618d8b
--- /dev/null
+++ b/contrib/lua-torch/nn/GPU.lua
@@ -0,0 +1,273 @@
+------------------------------------------------------------------------
+--[[ GPU ]]--
+-- Decorates a module such that its parameters are
+-- hosted on a specified GPU device.
+-- The operations are also executed on that device.
+-- Arguments input and gradOutput are converted to the specified device
+-- before being fed to the decorated module.
+-- Returned output is on the specified outdevice (defaults to device).
+-- Returned gradInput is allocated on the same device as the input.
+-- The unit test is located in cunn.
+------------------------------------------------------------------------
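+--
+-- Editor's sketch (requires cutorch; the device ids and sizes are
+-- illustrative):
+--   local m = nn.GPU(nn.Linear(100, 100), 2) -- parameters live on device 2
+--   m:cuda()
+--   local output = m:forward(torch.CudaTensor(16, 100)) -- input is copied to device 2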
+local GPU, parent = torch.class("nn.GPU", "nn.Container")
+
+function GPU:__init(module, device, outdevice)
+ parent.__init(self)
+ assert(torch.type(device) == 'number')
+ self.device = device
+ self.outdevice = outdevice or device
+
+ assert(torch.isTypeOf(module, 'nn.Module'))
+ self.modules[1] = module
+
+ if module:type():find('torch%.Cuda.*Tensor') then
+ self:type(module:type())
+ end
+end
+
+function GPU.recursiveModuleDevice(obj, device)
+ if type(obj) == 'table' and not torch.isTypeOf(obj, 'nn.GPU') and not obj.__noGPU__ then
+ for k,v in pairs(obj) do
+ obj[k] = GPU.recursiveModuleDevice(v, device)
+ end
+ elseif torch.type(obj):match('torch.Cuda.*Tensor') then
+ if obj:getDevice() ~= device then
+ obj = obj:clone() -- this will reallocate it to device
+ local newdevice = obj:getDevice()
+ -- when nElement() == 0 newdevice is 0
+ assert(newdevice == device or newdevice == 0)
+ end
+ end
+ assert(obj ~= nil)
+ return obj
+end
+
+-- set the device of the decorated module
+function GPU:setDevice(device)
+ self.device = device or self.device
+
+ assert(self.modules[1])
+ self.modules[1] = cutorch.withDevice(self.device, function()
+ return self.recursiveModuleDevice(self.modules[1], self.device)
+ end)
+ return self
+end
+
+-- when proto is a device number, returns a dst whose every element is on that device
+-- otherwise, if proto is a table/tensor, makes sure dst is identical to src, yet on the same device as proto
+function GPU.recursiveSetDevice(dst, src, proto)
+ local device, prototable
+ if torch.isTensor(proto) then
+ device = proto:getDevice()
+ elseif torch.type(proto) == 'number' then
+ device = proto
+ elseif torch.type(proto) == 'table' then
+ prototable = true
+ else
+ error"Expecting number, table or tensor for arg 3 (proto)"
+ end
+ if torch.type(src) == 'table' then
+ dst = torch.type(dst) == 'table' and dst or {}
+ for k,v in ipairs(src) do
+ dst[k] = GPU.recursiveSetDevice(dst[k], v, prototable and proto[k] or device)
+ end
+ for k=#src+1,#dst do
+ dst[k] = nil
+ end
+ elseif torch.type(src):match('torch.Cuda.*Tensor') and src:getDevice() ~= device and src:getDevice() ~= 0 then
+ if not (torch.type(dst):match('torch.Cuda.*Tensor') and dst:getDevice() == device) then
+ dst = src.new()
+ end
+ cutorch.withDevice(device, function() dst:resizeAs(src):copy(src) end)
+ else
+ dst = src
+ end
+ return dst
+end
+
+function GPU:updateOutput(input)
+ if self._type:find('torch%.Cuda.*Tensor') then
+ self._input = self.recursiveSetDevice(self._input, input, self.device)
+
+ local output = cutorch.withDevice(self.device, function()
+ return self.modules[1]:updateOutput(self._input)
+ end)
+
+ if self.device ~= self.outdevice then
+ self.output = self.recursiveSetDevice(self.output, output, self.outdevice)
+ else
+ self.output = output
+ end
+ else
+ self.output = self.modules[1]:updateOutput(input)
+ end
+
+ return self.output
+end
+
+function GPU:updateGradInput(input, gradOutput)
+ if self._type:find('torch%.Cuda.*Tensor') then
+ self._gradOutput = self.recursiveSetDevice(self._gradOutput, gradOutput, self.device)
+
+ local gradInput = cutorch.withDevice(self.device, function()
+ return self.modules[1]:updateGradInput(self._input, self._gradOutput)
+ end)
+
+ self.gradInput = self.recursiveSetDevice(self.gradInput, gradInput, input)
+ else
+ self.gradInput = self.modules[1]:updateGradInput(input, gradOutput)
+ end
+
+ return self.gradInput
+end
+
+function GPU:accGradParameters(input, gradOutput, scale)
+ if self._type:find('torch%.Cuda.*Tensor') then
+ cutorch.withDevice(self.device, function()
+ self.modules[1]:accGradParameters(self._input, self._gradOutput, scale)
+ end)
+ else
+ self.modules[1]:accGradParameters(input, gradOutput, scale)
+ end
+end
+
+function GPU:apply(callback)
+ if self._type:find('torch%.Cuda.*Tensor') then
+ cutorch.withDevice(self.device, function() parent.apply(self, callback) end)
+ else
+ parent.apply(self, callback)
+ end
+end
+
+function GPU:type(type, typecache)
+ if type and type:find('torch%.Cuda.*Tensor') then
+ cutorch.withDevice(self.device, function() parent.type(self, type, typecache) end)
+ self:setDevice()
+ else
+ self.output = nil
+ self.gradInput = nil
+ self._input = nil
+ self._gradOutput = nil
+ parent.type(self, type, typecache)
+ end
+ return self
+end
+
+function GPU:clearState()
+ nn.utils.clear(self, 'output', 'gradInput')
+ self._input = nil
+ self._gradOutput = nil
+ if self._type:find('torch%.Cuda.*Tensor') then
+ cutorch.withDevice(self.device, function() parent.clearState(self) end)
+ else
+ parent.clearState(self)
+ end
+end
+
+function GPU:zeroGradParameters()
+ if self._type:find('torch%.Cuda.*Tensor') then
+ cutorch.withDevice(self.device, function() parent.zeroGradParameters(self) end)
+ else
+ parent.zeroGradParameters(self)
+ end
+end
+
+function GPU:updateParameters(lr)
+ if self._type:find('torch%.Cuda.*Tensor') then
+ cutorch.withDevice(self.device, function() parent.updateParameters(self, lr) end)
+ else
+ parent.updateParameters(self, lr)
+ end
+end
+
+function GPU:training()
+ if self._type:find('torch%.Cuda.*Tensor') then
+ cutorch.withDevice(self.device, function() parent.training(self) end)
+ else
+ parent.training(self)
+ end
+end
+
+function GPU:evaluate()
+ if self._type:find('torch%.Cuda.*Tensor') then
+ cutorch.withDevice(self.device, function() parent.evaluate(self) end)
+ else
+ parent.evaluate(self)
+ end
+end
+
+function GPU:share(mlp, ...)
+ local args = {...}
+ if self._type:find('torch%.Cuda.*Tensor') then
+ cutorch.withDevice(self.device, function() parent.share(self, mlp, unpack(args)) end)
+ else
+ parent.share(self, mlp, unpack(args))
+ end
+ return self
+end
+
+function GPU:reset(...)
+ local args = {...}
+ if self._type:find('torch%.Cuda.*Tensor') then
+ cutorch.withDevice(self.device, function() parent.reset(self, unpack(args)) end)
+ else
+ parent.reset(self, unpack(args))
+ end
+ return self
+end
+
+function GPU:clone(...)
+ local args = {...}
+ if self._type:find('torch%.Cuda.*Tensor') then
+ return cutorch.withDevice(self.device, function() parent.clone(self, unpack(args)) end)
+ else
+ return parent.clone(self, unpack(args))
+ end
+end
+
+function GPU:write(file)
+ -- Write all values in the object as a table.
+ local object = {}
+ for k, v in pairs(self) do
+ object[k] = v
+ end
+ local header = {self._type, self.device}
+ file:writeObject(header)
+ file:writeObject(object)
+end
+
+function GPU:read(file)
+ local header = file:readObject()
+ local object
+ if header[1] and header[1]:find('torch%.Cuda.*Tensor') then
+ local device = header[2]
+ if device > cutorch.getDeviceCount() then
+ print"Warning : model was saved with more devices than available on current host."
+ print"Attempting to load module onto device 1"
+ device = 1
+ end
+ object = cutorch.withDevice(device, function() return file:readObject() end)
+ else
+ object = file:readObject()
+ end
+
+ for k, v in pairs(object) do
+ self[k] = v
+ end
+end
+
+function GPU:__tostring__()
+ if self.modules[1].__tostring__ then
+ return torch.type(self) .. '(' .. self.device ..') @ ' .. self.modules[1]:__tostring__()
+ else
+ return torch.type(self) .. '(' .. self.device ..') @ ' .. torch.type(self.modules[1])
+ end
+end
+
+function GPU:accUpdateGradParameters(input, gradOutput, lr)
+ error("Not Implemented for "..torch.type(self))
+end
+
+function GPU:sharedAccUpdateGradParameters(input, gradOutput, lr)
+ error("Not Implemented for "..torch.type(self))
+end
diff --git a/contrib/lua-torch/nn/GatedLinearUnit.lua b/contrib/lua-torch/nn/GatedLinearUnit.lua
new file mode 100644
index 000000000..5273abfd4
--- /dev/null
+++ b/contrib/lua-torch/nn/GatedLinearUnit.lua
@@ -0,0 +1,27 @@
+local GatedLinearUnit, parent = torch.class('nn.GatedLinearUnit', 'nn.Module')
+
+function GatedLinearUnit:__init(dim)
+ parent.__init(self)
+ self.dim = dim
+end
+
+function GatedLinearUnit:updateOutput(input)
+ local dim = self.dim or input:dim()
+ input.THNN.GatedLinear_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ dim
+ )
+ return self.output
+end
+
+function GatedLinearUnit:updateGradInput(input, gradOutput)
+ local dim = self.dim or input:dim()
+ input.THNN.GatedLinear_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ dim
+ )
+ return self.gradInput
+end
diff --git a/contrib/lua-torch/nn/GradientReversal.lua b/contrib/lua-torch/nn/GradientReversal.lua
new file mode 100644
index 000000000..c08b1dfb0
--- /dev/null
+++ b/contrib/lua-torch/nn/GradientReversal.lua
@@ -0,0 +1,32 @@
+local GradientReversal, parent = torch.class('nn.GradientReversal', 'nn.Module')
+
+GradientReversal.__version = 2
+
+function GradientReversal:__init(lambda)
+ lambda = lambda or 1
+ parent.__init(self)
+ self.lambda = lambda
+end
+
+function GradientReversal:setLambda(lambda)
+ self.lambda = lambda
+end
+
+function GradientReversal:updateOutput(input)
+ self.output:set(input)
+ return self.output
+end
+
+function GradientReversal:updateGradInput(input, gradOutput)
+ self.gradInput:resizeAs(gradOutput)
+ self.gradInput:copy(gradOutput)
+ self.gradInput:mul(-self.lambda)
+ return self.gradInput
+end
+
+function GradientReversal:read(file, version)
+ parent.read(self, file)
+ if version < 2 then
+ self.lambda = 1
+ end
+end
diff --git a/contrib/lua-torch/nn/HardShrink.lua b/contrib/lua-torch/nn/HardShrink.lua
new file mode 100644
index 000000000..85ff5909c
--- /dev/null
+++ b/contrib/lua-torch/nn/HardShrink.lua
@@ -0,0 +1,25 @@
+local HardShrink, parent = torch.class('nn.HardShrink', 'nn.Module')
+
+function HardShrink:__init(lam)
+ parent.__init(self)
+ self.lambda = lam or 0.5
+end
+
+function HardShrink:updateOutput(input)
+ input.THNN.HardShrink_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ self.lambda
+ )
+ return self.output
+end
+
+function HardShrink:updateGradInput(input, gradOutput)
+ input.THNN.HardShrink_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.lambda
+ )
+ return self.gradInput
+end
diff --git a/contrib/lua-torch/nn/HardTanh.lua b/contrib/lua-torch/nn/HardTanh.lua
new file mode 100644
index 000000000..07cfc6255
--- /dev/null
+++ b/contrib/lua-torch/nn/HardTanh.lua
@@ -0,0 +1,37 @@
+local HardTanh, parent = torch.class('nn.HardTanh', 'nn.Module')
+
+function HardTanh:__init(min_value, max_value, inplace)
+ parent.__init(self)
+ self.min_val = min_value or -1
+ self.max_val = max_value or 1
+ self.inplace = inplace or false
+ if (inplace and type(inplace) ~= 'boolean') then
+ error('in-place flag must be boolean')
+ end
+ assert(self.max_val>self.min_val, 'max_value must be larger than min_value')
+end
+
+function HardTanh:updateOutput(input)
+ self.min_val = self.min_val or -1
+ self.max_val = self.max_val or 1
+ input.THNN.HardTanh_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ self.min_val,
+ self.max_val,
+ self.inplace or false
+ )
+ return self.output
+end
+
+function HardTanh:updateGradInput(input, gradOutput)
+ input.THNN.HardTanh_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.min_val,
+ self.max_val,
+ self.inplace or false
+ )
+ return self.gradInput
+end
diff --git a/contrib/lua-torch/nn/HingeEmbeddingCriterion.lua b/contrib/lua-torch/nn/HingeEmbeddingCriterion.lua
new file mode 100644
index 000000000..13ad00f19
--- /dev/null
+++ b/contrib/lua-torch/nn/HingeEmbeddingCriterion.lua
@@ -0,0 +1,43 @@
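+-- Editor's note: per element the loss is x_i when y_i == 1 and
+-- max(0, margin - x_i) when y_i == -1; with sizeAverage it is divided
+-- by the number of elements.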
+local HingeEmbeddingCriterion, parent = torch.class('nn.HingeEmbeddingCriterion', 'nn.Criterion')
+
+function HingeEmbeddingCriterion:__init(margin)
+ parent.__init(self)
+ self.margin = margin or 1
+ self.sizeAverage = true
+end
+
+function HingeEmbeddingCriterion:updateOutput(input,y)
+ self.buffer = self.buffer or input.new()
+ if not torch.isTensor(y) then
+ self.ty = self.ty or input.new():resize(1)
+ self.ty[1]=y
+ y=self.ty
+ end
+
+ self.buffer:resizeAs(input):copy(input)
+ self.buffer[torch.eq(y, -1)] = 0
+ self.output = self.buffer:sum()
+
+ self.buffer:fill(self.margin):add(-1, input)
+ self.buffer:cmax(0)
+ self.buffer[torch.eq(y, 1)] = 0
+ self.output = self.output + self.buffer:sum()
+
+ if (self.sizeAverage == nil or self.sizeAverage == true) then
+ self.output = self.output / input:nElement()
+ end
+
+ return self.output
+end
+
+function HingeEmbeddingCriterion:updateGradInput(input, y)
+ if not torch.isTensor(y) then self.ty[1]=y; y=self.ty end
+ self.gradInput:resizeAs(input):copy(y)
+ self.gradInput[torch.cmul(torch.eq(y, -1), torch.gt(input, self.margin))] = 0
+
+ if (self.sizeAverage == nil or self.sizeAverage == true) then
+ self.gradInput:mul(1 / input:nElement())
+ end
+
+ return self.gradInput
+end
diff --git a/contrib/lua-torch/nn/Identity.lua b/contrib/lua-torch/nn/Identity.lua
new file mode 100644
index 000000000..5e6ccb624
--- /dev/null
+++ b/contrib/lua-torch/nn/Identity.lua
@@ -0,0 +1,30 @@
+local Identity, _ = torch.class('nn.Identity', 'nn.Module')
+
+function Identity:updateOutput(input)
+ self.output = input
+ return self.output
+end
+
+
+function Identity:updateGradInput(input, gradOutput)
+ self.gradInput = gradOutput
+ return self.gradInput
+end
+
+function Identity:clearState()
+ -- don't call set because it might reset referenced tensors
+ local function clear(f)
+ if self[f] then
+ if torch.isTensor(self[f]) then
+ self[f] = self[f].new()
+ elseif type(self[f]) == 'table' then
+ self[f] = {}
+ else
+ self[f] = nil
+ end
+ end
+ end
+ clear('output')
+ clear('gradInput')
+ return self
+end
diff --git a/contrib/lua-torch/nn/Index.lua b/contrib/lua-torch/nn/Index.lua
new file mode 100644
index 000000000..6aa429708
--- /dev/null
+++ b/contrib/lua-torch/nn/Index.lua
@@ -0,0 +1,32 @@
+local Index, parent = torch.class('nn.Index', 'nn.Module')
+
+function Index:__init(dimension)
+ parent.__init(self)
+ self.dimension = dimension
+ self.gradInput = {self.gradInput, self.gradInput.new()}
+end
+
+function Index:updateOutput(input)
+ local t = input[1]
+ local index = input[2]
+ self.output:index(t, self.dimension, index)
+ return self.output
+end
+
+function Index:updateGradInput(input, gradOutput)
+ local t = input[1]
+ local index = input[2]
+
+ self.gradInput[2]:resize(index:size()):zero()
+ local gradInput = self.gradInput[1] -- no gradient for the index variable
+ gradInput:resizeAs(t):zero()
+ gradInput:indexAdd(self.dimension, index, gradOutput)
+ return self.gradInput
+end
+
+function Index:clearState()
+ self.gradInput[1]:set()
+ self.gradInput[2]:set()
+ self.output:set()
+ return self
+end
diff --git a/contrib/lua-torch/nn/IndexLinear.lua b/contrib/lua-torch/nn/IndexLinear.lua
new file mode 100644
index 000000000..928e5d3f2
--- /dev/null
+++ b/contrib/lua-torch/nn/IndexLinear.lua
@@ -0,0 +1,398 @@
+local ffi = require 'ffi'
+local IndexLinear, parent = torch.class('nn.IndexLinear', 'nn.Module')
+
+
+
+function IndexLinear:__init(inputSize, outputSize, doGradInput, keysOffset, weight, bias, normalize)
+ parent.__init(self)
+
+   -- We need 3 extra parameters per feature
+   -- if we normalize:
+   -- * The max-abs value
+   -- * The inverse of the max-abs value
+   -- * The per-feature bias
+   -- We keep an extra placeholder for future per-feature learning-rate manipulation.
+   -- So it's 4 total.
+ self.normalize = normalize and 4 or 0
+
+ -- This is important to keep the possibility of sharing a weight
+ -- directly, without having to allocate it first.
+   -- The reason is that these weights can be very large.
+ self.weight = weight or torch.Tensor(inputSize, outputSize + self.normalize):zero()
+ self.bias = bias or torch.Tensor(outputSize):zero()
+ self.inputSize = self.weight and self.weight:size(1) or inputSize
+ self.outputSize = self.weight and (self.weight:size(2)-self.normalize) or outputSize
+
+ -- gradWeight is not initialized as we're doing dense gradient accumulation
+ -- This is more efficient and avoids allocating a giant useless gradWeight
+ self.gradWeight = torch.Tensor()
+
+ -- gradBias still works the same as it's already dense
+ self.gradBias = torch.Tensor(self.outputSize):zero()
+
+ -- Buffers
+ self.gradWeightBuffer = torch.Tensor()
+ self.valuesBuffer = torch.Tensor()
+ self.normalizedValues = torch.Tensor()
+
+ -- That is used to accumulate keys and gradWeight
+ -- when doing gradients accumulations
+ self.running = {
+ cumSumSizes = {},
+ keys = {},
+ gradWeight = {},
+ counter = 1,
+ }
+
+ -- self.sizes, self.cumSumSizes are calculated on the CPU even when using CUDA.
+ -- These two tables make it easier to resize these buffers instead of re-allocating them.
+ -- self.*Cache[1] always contains values on CPU.
+ -- If CUDA is being used, self.*Cache[2] contains values on GPU.
+ self.sizesCache = {}
+ self.cumSumSizesCache = {}
+
+ -- A few options
+ self.weightDecay = 0
+ self.doGradInput = doGradInput or false
+   self.offset = keysOffset and keysOffset-1 or -1 -- the C backend adds self.offset to the keys (default -1 maps 1-based keys to 0-based indices)
+end
+
+-- Reset all the parameters needed
+-- for normalization to 0
+function IndexLinear:reset(stdv)
+ if stdv then
+ stdv = stdv * math.sqrt(3)
+ else
+ stdv = 1./math.sqrt(self.weight:size(2))
+ end
+ self.weight:uniform(-stdv, stdv)
+ self.bias:uniform(-stdv, stdv):mul(0.000001)
+ if self.normalize and self.normalize > 0 then
+ self.weight[{{}, {1,self.normalize}}]:zero()
+ end
+end
+
+function IndexLinear:reshapeInput(input)
+ assert(type(input) == 'table')
+
+ local ninputs = 0
+ for _, v in ipairs(input) do
+ ninputs = ninputs + 1
+ end
+
+ assert(ninputs == 2 or ninputs == 3)
+
+ -- If format is:
+ -- {
+ -- torch.LongTensor(size1+size2+...+sizeN), -- concatenated batch of keys
+ -- torch.Tensor(size1+size2+...+sizeN), -- concatenated batch of values
+ -- torch.LongTensor(N), -- keys/values sizes (values are {size1, ..., sizeN})
+ -- }
+ if ninputs == 3 then
+ local fkeys = input[1]
+ local fvals = input[2]
+ local fsizes = torch.isTensor(input[3]) and input[3] or fkeys.new{input[3]}
+ assert(fkeys:nElement() == fvals:nElement(), 'Keys and values should be of same size')
+ assert(fkeys:dim() == 1, 'Keys and values should be 1D')
+ self.isFlat = true
+ self.noBatch = false
+ return fkeys, fvals, fsizes
+ end
+
+ local keys = input[1]
+ local values = input[2]
+ local lkeys, lvalues
+
+ -- If format is:
+ -- {
+ -- { torch.LongTensor(size1), torch.LongTensor(size2), ..., torch.LongTensor(sizeN) }, -- batch of keys
+ -- { torch.Tensor(size1), torch.Tensor(size2), ..., torch.Tensor(sizeN) }, -- batch of values,
+ -- }
+ if type(keys) == 'table' and type(values) == 'table' then
+ lkeys, lvalues = keys, values
+ self.isFlat = false
+ self.noBatch = false
+
+ -- If format is not a batch:
+ -- {
+ -- torch.LongTensor(size1), -- keys
+ -- torch.Tensor(size1), -- values,
+ -- }
+ elseif torch.isTensor(keys) and torch.isTensor(values) then
+ lkeys, lvalues = {keys}, {values}
+ self.isFlat = false
+ self.noBatch = true
+ else
+ error('Wrong input format.')
+ end
+
+ for i=1,#lkeys do
+ assert(lvalues[i]:dim() == 1 and lkeys[i]:dim() == 1, "keys and values should be 1D")
+ end
+
+ return lkeys, lvalues
+end
+
+function IndexLinear:longTensor(...)
+ if (self:type() == 'torch.CudaTensor') then
+ return torch.CudaLongTensor(...)
+ else
+ return torch.LongTensor(...)
+ end
+end
+
+function IndexLinear:flattenInputs(input)
+ local lkeys, lvalues, sizes = self:reshapeInput(input)
+
+ local counter = self.running.counter
+
+ -- Ensure everything is of the right type
+ local isCuda = (self:type() == 'torch.CudaTensor')
+ self.running.keys[counter] = self.running.keys[counter] or self:longTensor()
+ self.keys = self.running.keys[counter]
+
+ if self.isFlat then
+ self.values = self.values or lvalues.new()
+ self.sizes = self.sizes or self:longTensor()
+
+ self.keys:resize(lkeys:size()):copy(lkeys)
+ self.values:resize(lvalues:size()):copy(lvalues)
+ self.sizes = sizes
+ self.cumSumSizes = self.cumSumSizes or self.sizes.new()
+ self.cumSumSizes:cumsum(self.sizes)
+ else
+ self.values = self.values or lvalues[1].new()
+
+ self.lkeys = lkeys
+ self.lvalues = lvalues
+ local batchSize = #self.lkeys
+
+ self.sizesCache[1] = self.sizesCache[1] or torch.LongTensor(batchSize)
+ self.cumSumSizesCache[1] = self.cumSumSizesCache[1] or torch.LongTensor(batchSize)
+
+ self.sizes = self.sizesCache[1]
+ self.cumSumSizes = self.cumSumSizesCache[1]
+
+ self.sizes:resize(batchSize)
+ self.cumSumSizes:resize(batchSize)
+
+ for i = 1,batchSize do
+ self.sizes[i] = self.lkeys[i]:size(1)
+ end
+ self.cumSumSizes:cumsum(self.sizes)
+
+ self.keys:cat(self.lkeys, 1)
+ self.values:cat(self.lvalues, 1)
+
+ if isCuda then
+ -- Get the GPU cache
+ self.sizesCache[2] = self.sizesCache[2] or torch.CudaLongTensor()
+ self.cumSumSizesCache[2] = self.cumSumSizesCache[2] or torch.CudaLongTensor()
+
+ self.sizes = self.sizesCache[2]
+ self.cumSumSizes = self.cumSumSizesCache[2]
+
+ -- Resize and copy to GPU
+ self.sizes:resize(batchSize):copy(self.sizesCache[1])
+ self.cumSumSizes:resize(batchSize):copy(self.cumSumSizesCache[1])
+ end
+ end
+ self.running.cumSumSizes[counter] = self.cumSumSizes
+end
+
+function IndexLinear:updateOutput(input)
+
+ self:flattenInputs(input)
+
+ self.values.THNN.IndexLinear_updateOutput(
+ self.keys:cdata(),
+ self.offset,
+ self.values:cdata(),
+ self.sizes:cdata(),
+ self.cumSumSizes:cdata(),
+ self.output:cdata(),
+ self.weight:cdata(),
+ self.bias:cdata(),
+ self.normalizedValues:cdata(),
+ self.train and 1 or 0
+ )
+
+ if self.noBatch then
+ self.output:resize(self.output:size(2))
+ end
+ return self.output
+end
+
+function IndexLinear:accUpdateGradParameters(input, gradOutput, scale)
+ self.values.THNN.IndexLinear_accUpdateGradParameters(
+ self.keys:cdata(),
+ self.offset,
+ self.normalize > 0 and self.normalizedValues:cdata() or self.values:cdata(),
+ self.sizes:cdata(),
+ self.cumSumSizes:cdata(),
+ gradOutput:cdata(),
+ self.weight:cdata(),
+ self.bias:cdata(),
+ self.weightDecay or 0,
+ scale or 1
+ )
+end
+
+function IndexLinear:accGradParameters(input, gradOutput, scale)
+
+ local counter = self.running.counter
+
+ -- Same as the running.keys in the updateOutput function,
+ -- get a table of dense running.gradWeight
+ self.running.gradWeight[counter] = self.running.gradWeight[counter] or self.values.new()
+ self.values.THNN.IndexLinear_accGradParameters(
+ self.keys:cdata(),
+ self.offset,
+ self.normalize > 0 and self.normalizedValues:cdata() or self.values:cdata(),
+ self.sizes:cdata(),
+ self.cumSumSizes:cdata(),
+ gradOutput:cdata(),
+ self.running.gradWeight[counter]:cdata(),
+ self.gradBias:cdata(),
+ self.weight:cdata(),
+ self.bias:cdata(),
+ self.valuesBuffer:cdata(),
+ self.weightDecay or 0,
+ scale or 1
+ )
+
+ -- Increment the running counter to create a new buffer
+   -- if we don't flush them in zeroGradParameters
+ self.running.counter = self.running.counter + 1
+end
+
+function IndexLinear:updateGradInput(input, gradOutput)
+ self.gradInput = {}
+ -- Revamped from nn.SparseLinear.updateGradInput
+ if self.doGradInput and self.normalize > 0 then
+ error('updateGradInput is not implemented in max-normalize mode')
+ end
+
+ local ini = self.weight:size(1)
+
+ if self.doGradInput then
+ local gi = gradOutput.new()
+ if gradOutput:dim() == 1 then
+ gi:resize(self.weight:size(1))
+ gi:mv(self.weight,gradOutput)
+ gi:resize(1, self.weight:size(1))
+ elseif gradOutput:dim() == 2 then
+ gi:resize(gradOutput:size(1), self.weight:size(1))
+ gi:mm(gradOutput, self.weight:t())
+ end
+
+ local indices = self.running.keys[1].new(ini):range(1, ini)
+
+ if self.isFlat then
+ self.gradInput[1] = torch.repeatTensor(indices, gi:size(1), 1)
+ self.gradInput[2] = gi
+ else
+ self.gradInput[1] = {}
+ self.gradInput[2] = {}
+ for i = 1,gi:size(1) do
+ self.gradInput[1][i] = self.running.keys[1].new(ini)
+ self.gradInput[1][i]:copy(indices)
+ self.gradInput[2][i] = gradOutput.new(ini)
+ self.gradInput[2][i]:copy(gi[i])
+ end
+ end
+ end
+
+ if self.noBatch then
+ if self.isFlat then
+ self.gradInput = {self.gradInput[1]:resize(ini), self.gradInput[2]:resize(ini)}
+ else
+ self.gradInput = {self.gradInput[1][1], self.gradInput[2][1]}
+ end
+ end
+ return self.gradInput
+end
+
+function IndexLinear:updateParameters(lr)
+ local counter = self.running.counter
+ if counter > 1 then
+ if counter == 2 then
+ self.updateKeys = self.running.keys[1]
+ self.gradWeight = self.running.gradWeight[1]
+ else
+ self.updateKeysBuffer = self.updateKeysBuffer or self:longTensor()
+ local lkeys = {}
+ local lgweights = {}
+ local totalSize = 0
+ local lCumSumSizes = {}
+ for i=1,counter-1 do
+ lkeys[i] = self.running.keys[i]
+ -- Change layout to take advantage of the 1-D contiguous torch.cat
+ lgweights[i] = self.running.gradWeight[i]:contiguous()
+ lgweights[i]:resize(lgweights[i]:nElement())
+ lCumSumSizes[i] = totalSize + self.running.cumSumSizes[i]
+ totalSize = totalSize + lkeys[i]:size(1)
+ end
+
+ self.updateKeysBuffer:cat(lkeys, 1)
+ self.gradWeightBuffer:cat(lgweights, 1)
+ self.cumSumSizes:cat(lCumSumSizes, 1)
+ self.gradWeightBuffer:resize(totalSize, self.outputSize)
+ self.gradWeight = self.gradWeightBuffer
+ self.updateKeys = self.updateKeysBuffer
+ end
+ self.values.THNN.IndexLinear_updateParameters(
+ self.gradWeight:cdata(),
+ self.gradBias:cdata(),
+ self.weight:cdata(),
+ self.bias:cdata(),
+ self.updateKeys:cdata(),
+ self.cumSumSizes:cdata(),
+ self.offset,
+ self.weightDecay or 0,
+ lr or error('You must specify a learning rate')
+ )
+ end
+end
+
+function IndexLinear:zeroGradParameters()
+ -- No need to do anything here as gradWeight is dense
+ self.gradBias:zero()
+
+   -- The code below would reset the smart scaling
+   -- parameters for each feature every time
+   -- zeroGradParameters is called.
+   -- TODO: decide what to do with this code.
+   -- NB: it must stay commented out in sync with the corresponding
+   -- code in lib/THNN/generic/IndexLinear.c, in the accUpdateGradParameters function.
+
+ --[[
+ local w = self.weight:select(2, 3)
+ if self.updateKeys and self.updateKeys:nElement() > 0 then
+ self.updateKeysBuffer:resizeAs(self.updateKeys):copy(self.updateKeys):add(self.offset+1)
+ w:indexFill(1, self.updateKeysBuffer, 0)
+ end
+ ]]--
+ self.running.counter = 1
+end
+
+function IndexLinear:parameters()
+ return {self.weight, self.bias}, {self.running, self.gradBias}
+end
+
+function IndexLinear:clearState()
+ self.running.keys = {}
+ self.running.gradWeight = {}
+ self.keys = nil
+ self.zerokeys = nil
+ self.updateKeys = nil
+ self.values = nil
+ self.sizes = nil
+ self.lkeys = {}
+ self.lvalues = {}
+ self.gradWeightBuffer = self.gradWeightBuffer.new()
+ self.valuesBuffer = self.valuesBuffer.new()
+ self.updateKeysBuffer = nil
+ self.values = nil
+ return parent.clearState(self)
+end
diff --git a/contrib/lua-torch/nn/Jacobian.lua b/contrib/lua-torch/nn/Jacobian.lua
new file mode 100644
index 000000000..4f728b18c
--- /dev/null
+++ b/contrib/lua-torch/nn/Jacobian.lua
@@ -0,0 +1,389 @@
+nn.Jacobian = {}
+
+function nn.Jacobian.backward(module, input, param, dparam)
+ local doparam = 0
+ if param then
+ doparam = 1
+ end
+ param = param or input
+ -- output deriv
+ module:forward(input)
+ local dout = module.output.new():resizeAs(module.output)
+ -- 1D view
+ local sdout = module.output.new(dout:storage(),1,dout:nElement())
+ -- jacobian matrix to calculate
+ local jacobian = torch.Tensor(param:nElement(),dout:nElement()):zero()
+
+ for i=1,sdout:nElement() do
+ dout:zero()
+ sdout[i] = 1
+ module:zeroGradParameters()
+ local din = module:updateGradInput(input, dout)
+ module:accGradParameters(input, dout)
+ if doparam == 1 then
+ jacobian:select(2,i):copy(dparam)
+ else
+ jacobian:select(2,i):copy(din)
+ end
+ end
+ return jacobian
+end
+
+function nn.Jacobian.backwardUpdate(module, input, param)
+
+ -- output deriv
+ module:forward(input)
+ local dout = module.output.new():resizeAs(module.output)
+ -- 1D view
+ local sdout = module.output.new(dout:storage(),1,dout:nElement())
+ -- jacobian matrix to calculate
+ local jacobian = torch.Tensor(param:nElement(),dout:nElement()):zero()
+
+ -- original param
+ local params = module:parameters()
+ local origparams = {}
+ for j=1,#params do
+ table.insert(origparams, params[j]:clone())
+ end
+
+ for i=1,sdout:nElement() do
+ for j=1,#params do
+ params[j]:copy(origparams[j])
+ end
+ dout:zero()
+ sdout[i] = 1
+ module:updateGradInput(input, dout)
+ module:accUpdateGradParameters(input, dout, 1)
+ jacobian:select(2,i):copy(param)
+ end
+
+ for j=1,#params do
+ params[j]:copy(origparams[j])
+ end
+
+ return jacobian
+end
+
+function nn.Jacobian.forward(module, input, param, perturbation)
+ param = param or input
+ -- perturbation amount
+ perturbation = perturbation or 1e-6
+ -- 1D view of input
+   local sin = param.new(param):resize(param:nElement())
+ -- jacobian matrix to calculate
+ local jacobian = torch.Tensor():resize(param:nElement(),module:forward(input):nElement())
+
+ local outa = torch.Tensor(jacobian:size(2))
+ local outb = torch.Tensor(jacobian:size(2))
+
+ for i=1,sin:nElement() do
+ local orig = sin[i]
+ sin[i] = orig - perturbation
+ outa:copy(module:forward(input))
+ sin[i] = orig + perturbation
+ outb:copy(module:forward(input))
+ sin[i] = orig
+
+ outb:add(-1,outa):div(2*perturbation)
+ jacobian:select(1,i):copy(outb)
+ end
+
+ return jacobian
+end
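+
+-- Note: the loop above forms the central difference
+-- (f(x+h) - f(x-h)) / (2h) for each element of param,
+-- filling one row of the numerical Jacobian per element.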
+
+function nn.Jacobian.backwardDiagHessian(module, input, diagHessianParamName)
+ -- Compute the second derivatives (diagonal Hessian elements)
+ -- by backpropagation (using the code from hessian.lua).
+ --
+ -- This function computes the diagonal Hessian elements of the following function:
+ --
+ -- F(x_1, x_2, ..., x_n) = y_1^2/2 + y_2^2/2 + ... + y_m^2/2,
+ --
+ -- where
+ -- x_1, ..., x_n are the input values and parameters of the given module,
+ -- y_1, ..., y_m are the output values of the given module.
+ --
+ -- All x_i and y_i values are scalars here. In other words,
+ -- x_1, ..., x_n denote the scalar elements of the module input tensor,
+ -- the scalar elements of module.weight,
+ -- and the scalar elements of module.bias;
+ -- y_1, ..., y_m are the scalar elements of the module output tensor.
+ --
+ -- The diagonal Hessian elements of F are computed with respect to
+ -- the module input values and parameters (x_1, .., x_n).
+ --
+ -- The function F is chosen for its convenient properties:
+ --
+ -- dF / dy_i = y_i,
+ -- d^2F / dy_i^2 = 1.
+ --
+ -- In other words, the diagonal Hessian elements of F with respect
+ -- to the module OUTPUT values (y_1, ... y_m) are equal to 1.
+ --
+ -- Because of that, computing the diagonal Hessian elements of F
+ -- with respect to the module INPUT values and PARAMETERS (x_1, ..., x_n)
+ -- can be done by calling updateDiagHessianInput() and accDiagHessianParameters()
+ -- using a tensor of ones as diagHessianOutput.
+
+ module:forward(input)
+ local diagHessianOutput = module.output.new():resizeAs(module.output):fill(1)
+
+ module.diagHessianWeight:zero()
+ module.diagHessianBias:zero()
+ module:updateDiagHessianInput(input, diagHessianOutput)
+ module:accDiagHessianParameters(input, diagHessianOutput)
+
+ return module[diagHessianParamName]
+end
+
+function nn.Jacobian.linearModuleDiagHessian(module, input, gradParamName)
+ -- Compute the second derivatives (diagonal Hessian elements)
+ -- from the first derivatives for the given module
+ -- (without using the code from hessian.lua).
+ --
+ -- The given module is assumed to be linear with respect to its inputs and weights
+ -- (like nn.Linear, nn.SpatialConvolution, etc.)
+ --
+ -- This function computes the diagonal Hessian elements of the following function:
+ --
+ -- F(x_1, x_2, ..., x_n) = y_1^2/2 + y_2^2/2 + ... + y_m^2/2.
+ --
+   -- (See the comment for nn.Jacobian.backwardDiagHessian() for an explanation.)
+ --
+ -- The first derivatives of F with respect to
+ -- the module inputs and parameters (x_1, ..., x_n) are:
+ --
+ -- dF / dx_i = \sum_k (dF / dy_k) (dy_k / dx_i).
+ --
+ -- The second derivatives are:
+ --
+ -- d^2F / dx_i = \sum_k [(d^2F / dy_k^2) (dy_k / dx_i)^2 + (dF / dy_k) (d^2y_k / dx_i^2)].
+ --
+ -- The second derivatives of F with respect to the module outputs (y_1, ..., y_m)
+ -- are equal to 1, so:
+ --
+ -- d^2F / dx_i = \sum_k [(dy_k / dx_i)^2 + (dF / dy_k) (d^2y_k / dx_i^2)].
+ --
+ -- Assuming the linearity of module outputs (y_1, ..., y_m)
+ -- with respect to module inputs and parameters (x_1, ..., x_n),
+ -- we have (d^2y_k / dx_i^2) = 0,
+ -- and the expression finally becomes:
+ --
+ -- d^2F / dx_i = \sum_k (dy_k / dx_i)^2.
+ --
+ -- The first derivatives (dy_k / dx_i) are computed by normal backpropagation,
+ -- using updateGradInput() and accGradParameters().
+
+ local gradParam = module[gradParamName]
+
+ local diagHessian = gradParam.new():resize(gradParam:nElement()):zero()
+
+ module:forward(input)
+ local gradOutput = module.output.new():resizeAs(module.output)
+ local gradOutput1D = gradOutput:view(gradOutput:nElement())
+
+ for i=1,gradOutput:nElement() do
+ gradOutput1D:zero()
+ gradOutput1D[i] = 1
+ module.gradWeight:zero()
+ if module.bias then
+ module.gradBias:zero()
+ end
+ module:updateGradInput(input, gradOutput)
+ module:accGradParameters(input, gradOutput)
+ diagHessian:addcmul(gradParam, gradParam)
+ end
+
+ return diagHessian
+end
+
+function nn.Jacobian.forwardUpdate(module, input, param, perturbation)
+ -- perturbation amount
+ perturbation = perturbation or 1e-6
+ -- 1D view of input
+   local sin = param.new(param):resize(param:nElement())
+ -- jacobian matrix to calculate
+ local jacobian = torch.Tensor():resize(param:nElement(),module:forward(input):nElement())
+
+ local outa = torch.Tensor(jacobian:size(2))
+ local outb = torch.Tensor(jacobian:size(2))
+
+ for i=1,sin:nElement() do
+ local orig = sin[i]
+ sin[i] = orig - perturbation
+ outa:copy(module:forward(input))
+ sin[i] = orig + perturbation
+ outb:copy(module:forward(input))
+ sin[i] = orig
+
+ outb:add(-1,outa):div(2*perturbation)
+ jacobian:select(1,i):copy(outb)
+ jacobian:select(1,i):mul(-1)
+ jacobian:select(1,i):add(sin[i])
+ end
+ return jacobian
+end
+
+function nn.Jacobian.testJacobian(module, input, minval, maxval, perturbation)
+ minval = minval or -2
+ maxval = maxval or 2
+ local inrange = maxval - minval
+ input:copy(torch.rand(input:nElement()):mul(inrange):add(minval))
+ local jac_fprop = nn.Jacobian.forward(module, input, input, perturbation)
+ local jac_bprop = nn.Jacobian.backward(module, input)
+ local error = jac_fprop-jac_bprop
+ return error:abs():max()
+end
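+
+-- A minimal gradient-check sketch (module, input size and tolerance are
+-- illustrative):
+--
+--   local module = nn.Linear(10, 5)
+--   local input = torch.Tensor(10) -- testJacobian fills it with random values
+--   local err = nn.Jacobian.testJacobian(module, input)
+--   assert(err < 1e-6, 'analytic and numerical Jacobians disagree')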
+
+function nn.Jacobian.testJacobianParameters(module, input, param, dparam, minval, maxval, perturbation)
+ minval = minval or -2
+ maxval = maxval or 2
+ local inrange = maxval - minval
+ input:copy(torch.rand(input:nElement()):mul(inrange):add(minval))
+ param:copy(torch.rand(param:nElement()):mul(inrange):add(minval))
+ local jac_bprop = nn.Jacobian.backward(module, input, param, dparam)
+ local jac_fprop = nn.Jacobian.forward(module, input, param, perturbation)
+ local error = jac_fprop - jac_bprop
+ return error:abs():max()
+end
+
+function nn.Jacobian.testJacobianUpdateParameters(module, input, param, minval, maxval, perturbation)
+ minval = minval or -2
+ maxval = maxval or 2
+ local inrange = maxval - minval
+ input:copy(torch.rand(input:nElement()):mul(inrange):add(minval))
+ param:copy(torch.rand(param:nElement()):mul(inrange):add(minval))
+ local params_bprop = nn.Jacobian.backwardUpdate(module, input, param)
+ local params_fprop = nn.Jacobian.forwardUpdate(module, input, param, perturbation)
+
+ local error = params_fprop - params_bprop
+ return error:abs():max()
+end
+
+function nn.Jacobian.testDiagHessian(module, input, gradParamName, diagHessianParamName, minval, maxval)
+ -- Compute the diagonal Hessian elements for the same function in two different ways,
+ -- then compare the results and return the difference.
+
+ minval = minval or -2
+ maxval = maxval or 2
+ local inrange = maxval - minval
+ input:copy(torch.rand(input:nElement()):mul(inrange):add(minval))
+ module:initDiagHessianParameters()
+ local h_bprop = nn.Jacobian.backwardDiagHessian(module, input, diagHessianParamName)
+ local h_linearmodule = nn.Jacobian.linearModuleDiagHessian(module, input, gradParamName)
+ local error = h_bprop - h_linearmodule
+ return error:abs():max()
+end
+
+function nn.Jacobian.testDiagHessianInput(module, input, minval, maxval)
+ return nn.Jacobian.testDiagHessian(module, input, 'gradInput', 'diagHessianInput', minval, maxval)
+end
+
+function nn.Jacobian.testDiagHessianWeight(module, input, minval, maxval)
+ return nn.Jacobian.testDiagHessian(module, input, 'gradWeight', 'diagHessianWeight', minval, maxval)
+end
+
+function nn.Jacobian.testDiagHessianBias(module, input, minval, maxval)
+ return nn.Jacobian.testDiagHessian(module, input, 'gradBias', 'diagHessianBias', minval, maxval)
+end
+
+function nn.Jacobian.testIO(module,input, minval, maxval)
+ minval = minval or -2
+ maxval = maxval or 2
+ local inrange = maxval - minval
+ local inputclone = input:clone()
+
+ -- run module
+ module:forward(input)
+ local go = module.output:clone():copy(torch.rand(module.output:nElement()):mul(inrange):add(minval))
+ local goclone = go:clone()
+ module:zeroGradParameters()
+ module:updateGradInput(input,go)
+ module:accGradParameters(input,go)
+
+ local fo = module.output:clone()
+ local bo = module.gradInput:clone()
+
+ -- write module
+ local filename = os.tmpname()
+ local f = torch.DiskFile(filename, 'w'):binary()
+ -- call clearState and check that it returns itself
+ assert(module == module:clearState(),'clearState did not return self')
+ f:writeObject(module)
+ f:close()
+ -- read module
+ local m = torch.DiskFile(filename):binary():readObject()
+ m:forward(inputclone)
+ m:zeroGradParameters()
+ m:updateGradInput(inputclone,goclone)
+ m:accGradParameters(inputclone,goclone)
+ -- cleanup
+ os.remove(filename)
+
+ local fo2 = m.output:clone()
+ local bo2 = m.gradInput:clone()
+
+ local errf = fo - fo2
+ local errb = bo - bo2
+ return errf:abs():max(), errb:numel() == 0 and 0 or errb:abs():max()
+end
+
+function nn.Jacobian.testAllUpdate(module, input, weight, gradWeight)
+ local gradOutput
+ local lr = torch.uniform(0.1, 1)
+ local errors = {}
+
+ -- accGradParameters
+ local maccgp = module:clone()
+ local weightc = maccgp[weight]:clone()
+ maccgp:forward(input)
+ gradOutput = torch.rand(maccgp.output:size())
+ maccgp:zeroGradParameters()
+ maccgp:updateGradInput(input, gradOutput)
+ maccgp:accGradParameters(input, gradOutput)
+ maccgp:updateParameters(lr)
+ errors["accGradParameters"] = (weightc-maccgp[gradWeight]*lr-maccgp[weight]):norm()
+
+ -- accUpdateGradParameters
+ local maccugp = module:clone()
+ maccugp:forward(input)
+ maccugp:updateGradInput(input, gradOutput)
+ maccugp:accUpdateGradParameters(input, gradOutput, lr)
+ errors["accUpdateGradParameters"] = (maccugp[weight]-maccgp[weight]):norm()
+
+ -- shared, accGradParameters
+ local macsh1 = module:clone()
+ local macsh2 = module:clone()
+ macsh2:share(macsh1, weight)
+ macsh1:forward(input)
+ macsh2:forward(input)
+ macsh1:zeroGradParameters()
+ macsh2:zeroGradParameters()
+ macsh1:updateGradInput(input, gradOutput)
+ macsh2:updateGradInput(input, gradOutput)
+ macsh1:accGradParameters(input, gradOutput)
+ macsh2:accGradParameters(input, gradOutput)
+ macsh1:updateParameters(lr)
+ macsh2:updateParameters(lr)
+ local err = (weightc-maccgp[gradWeight]*(lr*2)-macsh1[weight]):norm()
+ err = err + (weightc-maccgp[gradWeight]*(lr*2)-macsh2[weight]):norm()
+ errors["accGradParameters [shared]"] = err
+
+ -- shared, accUpdateGradParameters
+ local macshu1 = module:clone()
+ local macshu2 = module:clone()
+ macshu2:share(macshu1, weight)
+ macshu1:forward(input)
+ macshu2:forward(input)
+ macshu1:updateGradInput(input, gradOutput)
+ macshu2:updateGradInput(input, gradOutput)
+ macshu1:accUpdateGradParameters(input, gradOutput, lr)
+ macshu2:accUpdateGradParameters(input, gradOutput, lr)
+ err = (weightc-maccgp[gradWeight]*(lr*2)-macshu1[weight]):norm()
+ err = err + (weightc-maccgp[gradWeight]*(lr*2)-macshu2[weight]):norm()
+ errors["accUpdateGradParameters [shared]"] = err
+
+ return errors
+end
diff --git a/contrib/lua-torch/nn/JoinTable.lua b/contrib/lua-torch/nn/JoinTable.lua
new file mode 100644
index 000000000..6ab68e189
--- /dev/null
+++ b/contrib/lua-torch/nn/JoinTable.lua
@@ -0,0 +1,74 @@
+local JoinTable, parent = torch.class('nn.JoinTable', 'nn.Module')
+
+function JoinTable:__init(dimension, nInputDims)
+ parent.__init(self)
+ self.size = torch.LongStorage()
+ self.dimension = dimension
+ self.gradInput = {}
+ self.nInputDims = nInputDims
+end
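+
+-- Usage sketch (shapes are illustrative): joining two tensors along the
+-- second dimension.
+--
+--   local m = nn.JoinTable(2)
+--   local a, b = torch.rand(4, 3), torch.rand(4, 5)
+--   local out = m:forward({a, b})              -- size 4x8
+--   local gi = m:backward({a, b}, out:clone()) -- table of 4x3 and 4x5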
+
+function JoinTable:_getPositiveDimension(input)
+ local dimension = self.dimension
+ if dimension < 0 then
+ dimension = input[1]:dim() + dimension + 1
+ elseif self.nInputDims and input[1]:dim()==(self.nInputDims+1) then
+ dimension = dimension + 1
+ end
+ return dimension
+end
+
+function JoinTable:updateOutput(input)
+ local dimension = self:_getPositiveDimension(input)
+
+ for i=1,#input do
+ local currentOutput = input[i]
+ if i == 1 then
+ self.size:resize(currentOutput:dim()):copy(currentOutput:size())
+ else
+ self.size[dimension] = self.size[dimension]
+ + currentOutput:size(dimension)
+ end
+ end
+ self.output:resize(self.size)
+
+ local offset = 1
+ for i=1,#input do
+ local currentOutput = input[i]
+ self.output:narrow(dimension, offset,
+ currentOutput:size(dimension)):copy(currentOutput)
+ offset = offset + currentOutput:size(dimension)
+ end
+ return self.output
+end
+
+function JoinTable:updateGradInput(input, gradOutput)
+ local dimension = self:_getPositiveDimension(input)
+
+ for i=1,#input do
+ if self.gradInput[i] == nil then
+ self.gradInput[i] = input[i].new()
+ end
+ self.gradInput[i]:resizeAs(input[i])
+ end
+
+ -- clear out invalid gradInputs
+ for i=#input+1, #self.gradInput do
+ self.gradInput[i] = nil
+ end
+
+ local offset = 1
+ for i=1,#input do
+ local currentOutput = input[i]
+ local currentGradInput = gradOutput:narrow(dimension, offset,
+ currentOutput:size(dimension))
+ self.gradInput[i]:copy(currentGradInput)
+ offset = offset + currentOutput:size(dimension)
+ end
+ return self.gradInput
+end
+
+function JoinTable:type(type, tensorCache)
+ self.gradInput = {}
+ return parent.type(self, type, tensorCache)
+end
diff --git a/contrib/lua-torch/nn/Kmeans.lua b/contrib/lua-torch/nn/Kmeans.lua
new file mode 100644
index 000000000..56066b63d
--- /dev/null
+++ b/contrib/lua-torch/nn/Kmeans.lua
@@ -0,0 +1,215 @@
+-- Online (Hard) Kmeans layer.
+local Kmeans, parent = torch.class('nn.Kmeans', 'nn.Module')
+
+function Kmeans:__init(k, dim, scale)
+ parent.__init(self)
+ self.k = k
+ self.dim = dim
+
+   -- scale for the online kmeans update
+ self.scale = scale
+
+ assert(k > 0, "Clusters cannot be 0 or negative.")
+ assert(dim > 0, "Dimensionality cannot be 0 or negative.")
+
+ -- Kmeans centers -> self.weight
+ self.weight = torch.Tensor(self.k, self.dim)
+
+ self.gradWeight = torch.Tensor(self.weight:size())
+ self.loss = 0 -- within cluster error of the last forward
+
+ self.clusterSampleCount = torch.Tensor(self.k)
+
+ self:reset()
+end
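+
+-- Usage sketch for one online update (names and sizes are illustrative):
+--
+--   local km = nn.Kmeans(4, 2)   -- 4 centers in 2 dimensions
+--   local x = torch.rand(16, 2)  -- minibatch of 16 samples
+--   km:initRandom(x)             -- or km:initKmeansPlus(x)
+--   km:zeroGradParameters()
+--   km:forward(x)                -- per-sample center index; sets km.loss
+--   km:backward(x, km.output)    -- gradOutput is ignored here
+--   km:updateParameters(1.0)     -- move centers toward their assigned samples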
+
+-- Reset
+function Kmeans:reset(stdev)
+ stdev = stdev or 1
+ self.weight:uniform(-stdev, stdev)
+end
+
+-- Initialize Kmeans weight with random samples from input.
+function Kmeans:initRandom(input)
+ local inputDim = input:nDimension()
+ assert(inputDim == 2, "Incorrect input dimensionality. Expecting 2D.")
+
+ local noOfSamples = input:size(1)
+ local dim = input:size(2)
+ assert(dim == self.dim, "Dimensionality of input and weight don't match.")
+   assert(noOfSamples >= self.k, "Need at least k samples for initialization.")
+
+ local indices = torch.zeros(self.k)
+ indices:random(1, noOfSamples)
+
+ for i=1, self.k do
+ self.weight[i]:copy(input[indices[i]])
+ end
+end
+
+-- Initialize using Kmeans++
+function Kmeans:initKmeansPlus(input, p)
+ self.p = p or self.p or 0.95
+   assert(self.p>=0 and self.p<=1, "p value must be between 0 and 1.")
+
+ local inputDim = input:nDimension()
+ assert(inputDim == 2, "Incorrect input dimensionality. Expecting 2D.")
+ local noOfSamples = input:size(1)
+
+ local pcount = math.ceil((1-self.p)*noOfSamples)
+ if pcount <= 0 then pcount = 1 end
+
+ local initializedK = 1
+ self.weight[initializedK]:copy(input[torch.random(noOfSamples)])
+ initializedK = initializedK + 1
+
+ local clusters = self.weight.new()
+ local clusterDistances = self.weight.new()
+ local temp = self.weight.new()
+ local expandedSample = self.weight.new()
+ local distances = self.weight.new()
+ distances:resize(noOfSamples):fill(math.huge)
+ local maxScores = self.weight.new()
+ local maxIndx = self.weight.new()
+
+ for k=initializedK, self.k do
+ clusters = self.weight[{{initializedK-1, initializedK-1}}]
+ for i=1, noOfSamples do
+ temp:expand(input[{{i}}], 1, self.dim)
+ expandedSample:resize(temp:size()):copy(temp)
+
+ -- Squared Euclidean distance
+ expandedSample:add(-1, clusters)
+ clusterDistances:norm(expandedSample, 2, 2)
+ clusterDistances:pow(2)
+ distances[i] = math.min(clusterDistances:min(), distances[i])
+ end
+ maxScores, maxIndx = distances:sort(true)
+ local tempIndx = torch.random(pcount)
+ local indx = maxIndx[tempIndx]
+ self.weight[initializedK]:copy(input[indx])
+ initializedK = initializedK + 1
+ end
+end
+
+local function isCudaTensor(tensor)
+ local typename = torch.typename(tensor)
+   if typename and typename:find('torch%.Cuda.*Tensor') then
+ return true
+ end
+ return false
+end
+
+-- Kmeans updateOutput (forward)
+function Kmeans:updateOutput(input)
+ local inputDim = input:nDimension()
+ assert(inputDim == 2, "Incorrect input dimensionality. Expecting 2D.")
+
+ local batchSize = input:size(1)
+ local dim = input:size(2)
+ assert(dim == self.dim, "Dimensionality of input and weight don't match.")
+
+ assert(input:isContiguous(), "Input is not contiguous.")
+
+ -- a sample copied k times to compute distance between sample and weight
+ self._expandedSamples = self._expandedSamples or self.weight.new()
+
+ -- distance between a sample and weight
+ self._clusterDistances = self._clusterDistances or self.weight.new()
+
+ self._temp = self._temp or input.new()
+ self._tempExpanded = self._tempExpanded or input.new()
+
+ -- Expanding inputs
+ self._temp:view(input, 1, batchSize, self.dim)
+ self._tempExpanded:expand(self._temp, self.k, batchSize, self.dim)
+ self._expandedSamples:resize(self.k, batchSize, self.dim)
+ :copy(self._tempExpanded)
+
+ -- Expanding weights
+ self._tempWeight = self._tempWeight or self.weight.new()
+ self._tempWeightExp = self._tempWeightExp or self.weight.new()
+   self._expandedWeight = self._expandedWeight or self.weight.new()
+ self._tempWeight:view(self.weight, self.k, 1, self.dim)
+ self._tempWeightExp:expand(self._tempWeight, self._expandedSamples:size())
+ self._expandedWeight:resize(self.k, batchSize, self.dim)
+ :copy(self._tempWeightExp)
+
+ -- x-c
+ self._expandedSamples:add(-1, self._expandedWeight)
+ -- Squared Euclidean distance
+ self._clusterDistances:norm(self._expandedSamples, 2, 3)
+ self._clusterDistances:pow(2)
+ self._clusterDistances:resize(self.k, batchSize)
+
+ self._minScore = self._minScore or self.weight.new()
+ self._minIndx = self._minIndx or (isCudaTensor(input) and torch.CudaLongTensor() or torch.LongTensor())
+ self._minScore:min(self._minIndx, self._clusterDistances, 1)
+ self._minIndx:resize(batchSize)
+
+ self.output:resize(batchSize):copy(self._minIndx)
+ self.loss = self._minScore:sum()
+
+ return self.output
+end
+
+-- Kmeans has its own criterion, hence gradInput is all zeros
+function Kmeans:updateGradInput(input, gradOutput)
+ self.gradInput:resize(input:size()):zero()
+
+ return self.gradInput
+end
+
+-- We define the kmeans update rule as c -> c + scale * 1/n * sum_i (x-c),
+-- where n is the number of samples assigned to center c.
+-- Since training applies gradient descent, gradWeight holds the negative
+-- of this update.
+function Kmeans:accGradParameters(input, gradOutput, scale)
+ local scale = self.scale or scale or 1
+   assert(scale > 0, "Scale has to be positive.")
+
+ -- Update cluster sample count
+ local batchSize = input:size(1)
+ self._cscAdder = self._cscAdder or self.weight.new()
+ self._cscAdder:resize(batchSize):fill(1)
+ self.clusterSampleCount:zero()
+ self.clusterSampleCount:indexAdd(1, self._minIndx, self._cscAdder)
+
+ -- scale * (x[k]-c[k]) where k is nearest cluster to x
+ self._gradWeight = self._gradWeight or self.gradWeight.new()
+ self._gradWeight:index(self.weight, 1, self._minIndx)
+ self._gradWeight:mul(-1)
+ self._gradWeight:add(input)
+ self._gradWeight:mul(-scale)
+
+ self._gradWeight2 = self._gradWeight2 or self.gradWeight.new()
+ self._gradWeight2:resizeAs(self.gradWeight):zero()
+ self._gradWeight2:indexAdd(1, self._minIndx, self._gradWeight)
+
+ -- scale/n * sum_i (x-c)
+ self._ccounts = self._ccounts or self.clusterSampleCount.new()
+ self._ccounts:resize(self.k):copy(self.clusterSampleCount)
+ self._ccounts:add(0.0000001) -- prevent division by zero errors
+
+ self._gradWeight2:cdiv(self._ccounts:view(self.k,1):expandAs(self.gradWeight))
+
+ self.gradWeight:add(self._gradWeight2)
+end
+
+function Kmeans:clearState()
+   -- release temporary buffers; they are lazily re-created on the next forward
+ self._expandedSamples = nil
+ self._clusterDistances = nil
+ self._temp = nil
+ self._tempExpanded = nil
+ self._tempWeight = nil
+ self._tempWeightExp = nil
+ self._expandedWeight = nil
+ self._minScore = nil
+ self._minIndx = nil
+ self._cscAdder = nil
+end
+
+function Kmeans:type(type, tensorCache)
+ self:clearState()
+ return parent.type(self, type, tensorCache)
+end
diff --git a/contrib/lua-torch/nn/L1Cost.lua b/contrib/lua-torch/nn/L1Cost.lua
new file mode 100644
index 000000000..6b58e0ec9
--- /dev/null
+++ b/contrib/lua-torch/nn/L1Cost.lua
@@ -0,0 +1,30 @@
+local THNN = require 'nn.THNN'
+local L1Cost, parent = torch.class('nn.L1Cost','nn.Criterion')
+
+function L1Cost:__init()
+ parent.__init(self)
+end
+
+function L1Cost:updateOutput(input)
+ self.output_tensor = self.output_tensor or input.new(1)
+ input.THNN.L1Cost_updateOutput(
+ input:cdata(),
+ self.output_tensor:cdata()
+ )
+ self.output = self.output_tensor[1]
+ return self.output
+end
+
+function L1Cost:updateGradInput(input)
+ input.THNN.L1Cost_updateGradInput(
+ input:cdata(),
+ THNN.NULL,
+ self.gradInput:cdata()
+ )
+ return self.gradInput
+end
+
+function L1Cost:clearState()
+ if self.output_tensor then self.output_tensor:set() end
+ return parent.clearState(self)
+end
diff --git a/contrib/lua-torch/nn/L1HingeEmbeddingCriterion.lua b/contrib/lua-torch/nn/L1HingeEmbeddingCriterion.lua
new file mode 100644
index 000000000..6957278f5
--- /dev/null
+++ b/contrib/lua-torch/nn/L1HingeEmbeddingCriterion.lua
@@ -0,0 +1,41 @@
+local L1HingeEmbeddingCriterion, parent = torch.class('nn.L1HingeEmbeddingCriterion', 'nn.Criterion')
+
+function L1HingeEmbeddingCriterion:__init(margin)
+ parent.__init(self)
+ margin = margin or 1
+ self.margin = margin
+ self.gradInput = {torch.Tensor(), torch.Tensor()}
+end
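+
+-- The loss is ||x1 - x2||_1 when y == 1, and max(0, margin - ||x1 - x2||_1)
+-- when y == -1. Worked example (values illustrative):
+--
+--   local crit = nn.L1HingeEmbeddingCriterion(1)
+--   crit:forward({torch.Tensor{0, 0}, torch.Tensor{0.3, 0.4}}, -1)
+--   -- max(0, 1 - 0.7) = 0.3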
+
+function L1HingeEmbeddingCriterion:updateOutput(input,y)
+ self.output=input[1]:dist(input[2],1);
+ if y == -1 then
+ self.output = math.max(0,self.margin - self.output);
+ end
+ return self.output
+end
+
+
+local function mathsign(t)
+ if t>0 then return 1; end
+ if t<0 then return -1; end
+ return 2*torch.random(2)-3;
+end
+
+function L1HingeEmbeddingCriterion:updateGradInput(input, y)
+ self.gradInput[1]:resizeAs(input[1])
+ self.gradInput[2]:resizeAs(input[2])
+ self.gradInput[1]:copy(input[1])
+ self.gradInput[1]:add(-1, input[2])
+ local dist = self.gradInput[1]:norm(1);
+ self.gradInput[1]:apply(mathsign) -- L1 gradient
+ if y == -1 then -- just to avoid a mul by 1
+ if dist > self.margin then
+ self.gradInput[1]:zero()
+ else
+ self.gradInput[1]:mul(-1)
+ end
+ end
+ self.gradInput[2]:zero():add(-1, self.gradInput[1])
+ return self.gradInput
+end
diff --git a/contrib/lua-torch/nn/L1Penalty.lua b/contrib/lua-torch/nn/L1Penalty.lua
new file mode 100644
index 000000000..9ee6b35ff
--- /dev/null
+++ b/contrib/lua-torch/nn/L1Penalty.lua
@@ -0,0 +1,42 @@
+local L1Penalty, parent = torch.class('nn.L1Penalty','nn.Module')
+
+-- This module acts as an L1 regularizer on a latent state: it adds the
+-- gradient of the L1 loss to [gradOutput]. The [input] is copied unchanged
+-- to the [output].
+
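+-- A usage sketch (layer sizes and penalty weight are illustrative):
+-- sparsifying the code layer of an autoencoder.
+--
+--   local net = nn.Sequential()
+--      :add(nn.Linear(100, 20))
+--      :add(nn.L1Penalty(1e-3)) -- contributes 1e-3 * ||code||_1 to the loss
+--      :add(nn.Linear(20, 100))
+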
+function L1Penalty:__init(l1weight, sizeAverage, provideOutput)
+ parent.__init(self)
+ self.l1weight = l1weight
+ self.sizeAverage = sizeAverage or false
+ if provideOutput == nil then
+ self.provideOutput = true
+ else
+ self.provideOutput = provideOutput
+ end
+end
+
+function L1Penalty:updateOutput(input)
+ local m = self.l1weight
+ if self.sizeAverage == true then
+ m = m/input:nElement()
+ end
+ local loss = m*input:norm(1)
+ self.loss = loss
+ self.output = input
+ return self.output
+end
+
+function L1Penalty:updateGradInput(input, gradOutput)
+ local m = self.l1weight
+ if self.sizeAverage == true then
+ m = m/input:nElement()
+ end
+
+ self.gradInput:resizeAs(input):copy(input):sign():mul(m)
+
+ if self.provideOutput == true then
+ self.gradInput:add(gradOutput)
+ end
+
+ return self.gradInput
+end
diff --git a/contrib/lua-torch/nn/LayerNormalization.lua b/contrib/lua-torch/nn/LayerNormalization.lua
new file mode 100644
index 000000000..722d7c802
--- /dev/null
+++ b/contrib/lua-torch/nn/LayerNormalization.lua
@@ -0,0 +1,27 @@
+-- Reference: https://arxiv.org/pdf/1607.06450.pdf (Section 3)
+
+local LayerNormalization, parent = torch.class('nn.LayerNormalization', 'nn.Sequential')
+function LayerNormalization:__init(nOutput, bias, eps, affine)
+ parent.__init(self)
+ eps = eps or 1e-10
+ affine = (affine == nil) and true or affine
+ bias = bias or 0
+
+ self:add(nn.ConcatTable()
+ :add(nn.Identity())
+ :add(nn.Sequential()
+ :add(nn.Mean(1, 1))
+ :add(nn.Replicate(nOutput,1,1))))
+ :add(nn.CSubTable())
+ :add(nn.Normalize(2, eps))
+ :add(nn.MulConstant(torch.sqrt(nOutput)))
+
+ if affine then
+ local biasTransform = nn.Add(nOutput, false)
+ biasTransform.bias:fill(bias)
+ local gainTransform = nn.CMul(nOutput)
+ gainTransform.weight:fill(1.)
+ self:add(gainTransform)
+ self:add(biasTransform)
+ end
+end
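+
+-- Note on the construction above: after the mean is subtracted,
+-- nn.Normalize(2, eps) followed by nn.MulConstant(sqrt(nOutput)) divides by
+-- the biased standard deviation, since ||x - mean||_2 = sqrt(nOutput) * std.
+-- Usage sketch (sizes are illustrative):
+--
+--   local ln = nn.LayerNormalization(256)
+--   local y = ln:forward(torch.rand(32, 256)) -- row-wise normalization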
diff --git a/contrib/lua-torch/nn/LeakyReLU.lua b/contrib/lua-torch/nn/LeakyReLU.lua
new file mode 100644
index 000000000..56b7f2542
--- /dev/null
+++ b/contrib/lua-torch/nn/LeakyReLU.lua
@@ -0,0 +1,41 @@
+local LeakyReLU, parent = torch.class('nn.LeakyReLU','nn.Module')
+
+function LeakyReLU:__init(negval,ip)
+ parent.__init(self)
+ if type(negval) == 'boolean' then
+      -- a single boolean argument is the in-place flag
+      ip = negval
+ self.negval = 1/100
+ else
+ self.negval = negval or (1/100)
+ end
+ -- default for inplace is false
+ self.inplace = ip or false
+ if self.negval < 0 then
+ self.inplace = false
+ end
+end
+
+function LeakyReLU:updateOutput(input)
+ input.THNN.LeakyReLU_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ self.negval,
+ self.inplace
+ )
+ return self.output
+end
+
+function LeakyReLU:updateGradInput(input, gradOutput)
+ input.THNN.LeakyReLU_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.negval,
+ self.inplace
+ )
+ return self.gradInput
+end
+
+function LeakyReLU:__tostring__()
+ return torch.type(self) .. string.format('(%g)', self.negval)
+end
diff --git a/contrib/lua-torch/nn/Linear.lua b/contrib/lua-torch/nn/Linear.lua
new file mode 100644
index 000000000..09b5979ce
--- /dev/null
+++ b/contrib/lua-torch/nn/Linear.lua
@@ -0,0 +1,122 @@
+local Linear, parent = torch.class('nn.Linear', 'nn.Module')
+
+function Linear:__init(inputSize, outputSize, bias)
+ parent.__init(self)
+ local bias = ((bias == nil) and true) or bias
+ self.weight = torch.Tensor(outputSize, inputSize)
+ self.gradWeight = torch.Tensor(outputSize, inputSize)
+ if bias then
+ self.bias = torch.Tensor(outputSize)
+ self.gradBias = torch.Tensor(outputSize)
+ end
+ self:reset()
+end
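+
+-- Usage sketch (sizes are illustrative):
+--
+--   local m = nn.Linear(10, 5)              -- y = W * x + b
+--   local y1 = m:forward(torch.rand(10))    -- 1D input -> size 5
+--   local y2 = m:forward(torch.rand(8, 10)) -- batch of 8 -> size 8x5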
+
+function Linear:noBias()
+ self.bias = nil
+ self.gradBias = nil
+ return self
+end
+
+function Linear:reset(stdv)
+ if stdv then
+ stdv = stdv * math.sqrt(3)
+ else
+ stdv = 1./math.sqrt(self.weight:size(2))
+ end
+ if nn.oldSeed then
+ for i=1,self.weight:size(1) do
+ self.weight:select(1, i):apply(function()
+ return torch.uniform(-stdv, stdv)
+ end)
+ end
+ if self.bias then
+ for i=1,self.bias:nElement() do
+ self.bias[i] = torch.uniform(-stdv, stdv)
+ end
+ end
+ else
+ self.weight:uniform(-stdv, stdv)
+ if self.bias then self.bias:uniform(-stdv, stdv) end
+ end
+ return self
+end
+
+function Linear:updateAddBuffer(input)
+ local nframe = input:size(1)
+ self.addBuffer = self.addBuffer or input.new()
+ if self.addBuffer:nElement() ~= nframe then
+ self.addBuffer:resize(nframe):fill(1)
+ end
+end
+
+function Linear:updateOutput(input)
+ if input:dim() == 1 then
+ self.output:resize(self.weight:size(1))
+ if self.bias then self.output:copy(self.bias) else self.output:zero() end
+ self.output:addmv(1, self.weight, input)
+ elseif input:dim() == 2 then
+ local nframe = input:size(1)
+ local nElement = self.output:nElement()
+ self.output:resize(nframe, self.weight:size(1))
+ if self.output:nElement() ~= nElement then
+ self.output:zero()
+ end
+ self:updateAddBuffer(input)
+ self.output:addmm(0, self.output, 1, input, self.weight:t())
+ if self.bias then self.output:addr(1, self.addBuffer, self.bias) end
+ else
+ error('input must be vector or matrix')
+ end
+
+ return self.output
+end
+
+function Linear:updateGradInput(input, gradOutput)
+ if self.gradInput then
+
+ local nElement = self.gradInput:nElement()
+ self.gradInput:resizeAs(input)
+ if self.gradInput:nElement() ~= nElement then
+ self.gradInput:zero()
+ end
+ if input:dim() == 1 then
+ self.gradInput:addmv(0, 1, self.weight:t(), gradOutput)
+ elseif input:dim() == 2 then
+ self.gradInput:addmm(0, 1, gradOutput, self.weight)
+ end
+
+ return self.gradInput
+ end
+end
+
+function Linear:accGradParameters(input, gradOutput, scale)
+ scale = scale or 1
+ if input:dim() == 1 then
+ self.gradWeight:addr(scale, gradOutput, input)
+ if self.bias then self.gradBias:add(scale, gradOutput) end
+ elseif input:dim() == 2 then
+ self.gradWeight:addmm(scale, gradOutput:t(), input)
+ if self.bias then
+ -- update the size of addBuffer if the input is not the same size as the one we had in last updateGradInput
+ self:updateAddBuffer(input)
+ self.gradBias:addmv(scale, gradOutput:t(), self.addBuffer)
+ end
+ end
+end
+
+function Linear:sharedAccUpdateGradParameters(input, gradOutput, lr)
+ -- we do not need to accumulate parameters when sharing:
+ self:defaultAccUpdateGradParameters(input, gradOutput, lr)
+end
+
+function Linear:clearState()
+ if self.addBuffer then self.addBuffer:set() end
+ return parent.clearState(self)
+end
+
+function Linear:__tostring__()
+ return torch.type(self) ..
+ string.format('(%d -> %d)', self.weight:size(2), self.weight:size(1)) ..
+ (self.bias == nil and ' without bias' or '')
+end
diff --git a/contrib/lua-torch/nn/LinearWeightNorm.lua b/contrib/lua-torch/nn/LinearWeightNorm.lua
new file mode 100755
index 000000000..a712f5535
--- /dev/null
+++ b/contrib/lua-torch/nn/LinearWeightNorm.lua
@@ -0,0 +1,168 @@
+local LinearWeightNorm, parent = torch.class('nn.LinearWeightNorm', 'nn.Linear')
+
+function LinearWeightNorm:__init(inputSize, outputSize, bias, eps)
+ nn.Module.__init(self) -- Skip nn.Linear constructor
+
+ local bias = ((bias == nil) and true) or bias
+
+ self.eps = eps or 1e-16
+
+ self.outputSize = outputSize
+ self.inputSize = inputSize
+
+ self.v = torch.Tensor(outputSize, inputSize)
+ self.gradV = torch.Tensor(outputSize, inputSize)
+
+ self.weight = torch.Tensor(outputSize, inputSize)
+
+ self.g = torch.Tensor(outputSize,1)
+ self.gradG = torch.Tensor(outputSize,1)
+
+ self.norm = torch.Tensor(outputSize,1)
+ self.scale = torch.Tensor(outputSize,1)
+
+ if bias then
+ self.bias = torch.Tensor(outputSize)
+ self.gradBias = torch.Tensor(outputSize)
+ end
+
+ self:reset()
+end
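+
+-- The module keeps the weight-normalized parametrization
+-- weight = g * v / ||v||_2 (applied row-wise), training g and v instead of
+-- weight directly. Conversion sketch (sizes are illustrative):
+--
+--   local wn = nn.LinearWeightNorm(10, 5)
+--   local wn2 = nn.LinearWeightNorm.fromLinear(nn.Linear(10, 5))
+--   local lin = wn2:toLinear() -- collapse back to a plain nn.Linear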
+
+function LinearWeightNorm:evaluate()
+ if self.train ~= false then
+ self:updateWeightMatrix()
+ end
+
+ parent.evaluate(self)
+end
+
+function LinearWeightNorm:initFromWeight(weight)
+ weight = weight or self.weight
+
+ self.g:norm(weight,2,2):clamp(self.eps,math.huge)
+ self.v:copy(weight)
+
+ return self
+end
+
+function LinearWeightNorm.fromLinear(linear)
+ local module = nn.LinearWeightNorm(linear.weight:size(2), linear.weight:size(1), torch.isTensor(linear.bias))
+ module.weight:copy(linear.weight)
+ module:initFromWeight()
+
+ if linear.bias then
+ module.bias:copy(linear.bias)
+ end
+
+ return module
+end
+
+function LinearWeightNorm:toLinear()
+ self:updateWeightMatrix()
+
+ local module = nn.Linear(self.inputSize, self.outputSize, torch.isTensor(self.bias))
+
+ module.weight:copy(self.weight)
+ if self.bias then
+ module.bias:copy(self.bias)
+ end
+
+ return module
+end
+
+function LinearWeightNorm:parameters()
+ if self.bias then
+ return {self.v, self.g, self.bias}, {self.gradV, self.gradG, self.gradBias}
+ else
+ return {self.v, self.g}, {self.gradV, self.gradG}
+ end
+end
+
+function LinearWeightNorm:reset(stdv)
+ if stdv then
+ stdv = stdv * math.sqrt(3)
+ else
+ stdv = 1 / math.sqrt(self.inputSize)
+ end
+
+ self.weight:uniform(-stdv,stdv)
+ self:initFromWeight()
+
+ if self.bias then
+ self.bias:uniform(-stdv,stdv)
+ end
+end
+
+function LinearWeightNorm:updateWeightMatrix()
+ if self.norm:dim() == 0 then self.norm:resizeAs(self.g) end
+ if self.scale:dim() == 0 then self.scale:resizeAs(self.g) end
+ if self.weight:dim() == 0 then self.weight:resizeAs(self.v) end
+
+ self.norm:norm(self.v,2,2):clamp(self.eps,math.huge)
+ self.scale:cdiv(self.g,self.norm)
+ self.weight:cmul(self.v,self.scale:expandAs(self.v))
+end
+
+function LinearWeightNorm:updateOutput(input)
+ if self.train ~= false then
+ self:updateWeightMatrix()
+ end
+
+ return parent.updateOutput(self, input)
+end
+
+function LinearWeightNorm:accGradParameters(input, gradOutput, scale)
+ scale = scale or 1
+ if input:dim() == 1 then
+ self.gradV:addr(scale, gradOutput, input)
+ if self.bias then self.gradBias:add(scale, gradOutput) end
+ elseif input:dim() == 2 then
+ self.gradV:addmm(scale, gradOutput:t(), input)
+ if self.bias then
+ -- update the size of addBuffer if the input is not the same size as the one we had in last updateGradInput
+ self:updateAddBuffer(input)
+ self.gradBias:addmv(scale, gradOutput:t(), self.addBuffer)
+ end
+ end
+
+ local scale = self.scale:expandAs(self.v)
+ local norm = self.norm:expandAs(self.v)
+
+ self.weight:cmul(self.gradV,self.v):cdiv(norm)
+ self.gradG:sum(self.weight,2)
+
+ self.gradV:cmul(scale)
+
+ self.weight:cmul(self.v,scale):cdiv(norm)
+ self.weight:cmul(self.gradG:expandAs(self.weight))
+
+ self.gradV:add(-1,self.weight)
+end
+
+function LinearWeightNorm:defaultAccUpdateGradParameters(input, gradOutput, lr)
+ local gradV = self.gradV
+ local gradG = self.gradG
+ local gradBias = self.gradBias
+
+ self.gradV = self.v
+ self.gradG = self.g
+ self.gradBias = self.bias
+
+ self:accGradParameters(input, gradOutput, -lr)
+
+ self.gradV = gradV
+ self.gradG = gradG
+ self.gradBias = gradBias
+end
+
+function LinearWeightNorm:clearState()
+ nn.utils.clear(self, 'weight', 'norm', 'scale')
+ return parent.clearState(self)
+end
+
+function LinearWeightNorm:__tostring__()
+ return torch.type(self) ..
+ string.format('(%d -> %d)', self.inputSize, self.outputSize) ..
+ (self.bias == nil and ' without bias' or '')
+end \ No newline at end of file
diff --git a/contrib/lua-torch/nn/Log.lua b/contrib/lua-torch/nn/Log.lua
new file mode 100644
index 000000000..e8f236bfb
--- /dev/null
+++ b/contrib/lua-torch/nn/Log.lua
@@ -0,0 +1,20 @@
+local Log, parent = torch.class('nn.Log', 'nn.Module')
+
+function Log:__init()
+ parent.__init(self)
+end
+
+function Log:updateOutput(input)
+ self.output:resizeAs(input)
+ self.output:copy(input)
+ self.output:log()
+ return self.output
+end
+
+function Log:updateGradInput(input, gradOutput)
+ self.gradInput:resizeAs(input)
+ self.gradInput:fill(1)
+ self.gradInput:cdiv(input)
+ self.gradInput:cmul(gradOutput)
+ return self.gradInput
+end
diff --git a/contrib/lua-torch/nn/LogSigmoid.lua b/contrib/lua-torch/nn/LogSigmoid.lua
new file mode 100644
index 000000000..cab848f4d
--- /dev/null
+++ b/contrib/lua-torch/nn/LogSigmoid.lua
@@ -0,0 +1,27 @@
+local LogSigmoid, parent = torch.class('nn.LogSigmoid', 'nn.Module')
+
+function LogSigmoid:updateOutput(input)
+ self.buffer = self.buffer or input.new()
+ input.THNN.LogSigmoid_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ self.buffer:cdata()
+ )
+ return self.output
+end
+
+function LogSigmoid:updateGradInput(input, gradOutput)
+ input.THNN.LogSigmoid_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.buffer:cdata()
+ )
+ return self.gradInput
+end
+
+function LogSigmoid:clearState()
+ if self.buffer then self.buffer:set() end
+ return parent.clearState(self)
+end
+
diff --git a/contrib/lua-torch/nn/LogSoftMax.lua b/contrib/lua-torch/nn/LogSoftMax.lua
new file mode 100644
index 000000000..37c8acae4
--- /dev/null
+++ b/contrib/lua-torch/nn/LogSoftMax.lua
@@ -0,0 +1,19 @@
+local LogSoftMax = torch.class('nn.LogSoftMax', 'nn.Module')
+
+function LogSoftMax:updateOutput(input)
+ input.THNN.LogSoftMax_updateOutput(
+ input:cdata(),
+ self.output:cdata()
+ )
+ return self.output
+end
+
+function LogSoftMax:updateGradInput(input, gradOutput)
+ input.THNN.LogSoftMax_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.output:cdata()
+ )
+ return self.gradInput
+end
diff --git a/contrib/lua-torch/nn/LookupTable.lua b/contrib/lua-torch/nn/LookupTable.lua
new file mode 100644
index 000000000..6cffc6c3e
--- /dev/null
+++ b/contrib/lua-torch/nn/LookupTable.lua
@@ -0,0 +1,166 @@
+local THNN = require 'nn.THNN'
+local LookupTable, parent = torch.class('nn.LookupTable', 'nn.Module')
+
+LookupTable.__version = 4
+
+function LookupTable:__init(nIndex, nOutput, paddingValue, maxNorm, normType)
+ parent.__init(self)
+
+ self.weight = torch.Tensor(nIndex, nOutput)
+ self.gradWeight = torch.Tensor(nIndex, nOutput):zero()
+ self.paddingValue = paddingValue or 0
+ self.maxNorm = maxNorm or nil
+ self.normType = normType or nil
+
+ self:reset()
+end
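+
+-- Usage sketch (vocabulary and embedding sizes are illustrative):
+--
+--   local lt = nn.LookupTable(1000, 64)                   -- 1000 rows, 64 dims
+--   local e1 = lt:forward(torch.LongTensor{5, 17, 99})    -- 3x64
+--   local e2 = lt:forward(torch.LongTensor(2, 3):fill(1)) -- 2x3x64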
+
+function LookupTable:backCompatibility()
+ self._count = self._count or torch.IntTensor()
+ self._input = self._input or torch.LongTensor()
+
+ if not self.shouldScaleGradByFreq then
+ self.shouldScaleGradByFreq = false
+ end
+end
+
+function LookupTable:accUpdateOnly()
+ self.gradWeight = nil
+ return self
+end
+
+function LookupTable:setPadding(paddingValue)
+ self.paddingValue = paddingValue
+ return self
+end
+
+function LookupTable:setMaxNorm(maxNorm)
+ self.maxNorm = maxNorm
+ return self
+end
+
+function LookupTable:setNormType(normType)
+ self.normType = normType
+ return self
+end
+
+function LookupTable:scaleGradByFreq()
+ self.shouldScaleGradByFreq = true
+ return self
+end
+
+function LookupTable:reset(stdv)
+ stdv = stdv or 1
+ self.weight:normal(0, stdv)
+end
+
+function LookupTable:makeInputContiguous(input)
+ -- make sure input is a contiguous torch.LongTensor
+ if (not input:isContiguous()) or torch.type(input) ~= torch.type(self._input) then
+ self.copiedInput = true
+ self._input:resize(input:size()):copy(input)
+ return self._input
+ end
+ self.copiedInput = false
+ return input
+end
+
+function LookupTable:updateOutput(input)
+ self:backCompatibility()
+ self:renorm(input)
+ input = self:makeInputContiguous(input)
+ if input:dim() == 1 then
+ self.output:index(self.weight, 1, input)
+ elseif input:dim() == 2 then
+ self.output:index(self.weight, 1, input:view(-1))
+ self.output = self.output:view(input:size(1), input:size(2), self.weight:size(2))
+ else
+ error("input must be a vector or matrix")
+ end
+ return self.output
+end
+
+function LookupTable:updateGradInput(input, gradOutput)
+   -- the input can be of any type (in the forward pass it is
+   -- converted to a LongTensor anyway), so we need to allocate
+   -- new memory each time the user changes the input type
+ if torch.type(self.gradInput) ~= torch.type(input) then
+ self.gradInput = input.new()
+ end
+ if not self.gradInput:isSameSizeAs(input) then
+ self.gradInput:resizeAs(input):zero()
+ end
+ return self.gradInput
+end
+
+function LookupTable:accGradParameters(input, gradOutput, scale)
+ self:backCompatibility()
+ input = self.copiedInput and self._input or input
+ if input:dim() == 2 then
+ input = input:view(-1)
+ elseif input:dim() ~= 1 then
+ error("input must be a vector or matrix")
+ end
+
+ self.gradWeight.THNN.LookupTable_accGradParameters(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradWeight:cdata(),
+ self._count:cdata(),
+ THNN.optionalTensor(self._sorted),
+ THNN.optionalTensor(self._indices),
+ self.shouldScaleGradByFreq or false,
+ self.paddingValue or 0,
+ scale or 1
+ )
+end
+
+function LookupTable:renorm(input)
+ if not self.maxNorm then
+ return
+ end
+   -- copy input into _input, so _input is contiguous.
+ -- The copied _input will be modified in the C code.
+ self._input:resize(input:size()):copy(input)
+ local row_idx = self._input
+ if row_idx:dim() == 2 then
+ row_idx = row_idx:view(-1)
+ elseif row_idx:dim() ~= 1 then
+ error("input must be a vector or matrix")
+ end
+ -- "row_idx" and "weight" will be modified in the C code
+ self.weight.THNN.LookupTable_renorm(
+ row_idx:cdata(),
+ self.weight:cdata(),
+ self.maxNorm,
+ self.normType or 2
+ )
+end
+
+function LookupTable:type(type, tensorCache)
+ parent.type(self, type, tensorCache)
+
+ if type and type:find('torch%.Cuda.*Tensor') then
+ -- CUDA uses _sorted and _indices temporary tensors
+ self._sorted = torch.CudaLongTensor and torch.CudaLongTensor.new() or torch.CudaTensor.new()
+ self._indices = torch.CudaLongTensor and torch.CudaLongTensor.new() or torch.CudaTensor.new()
+ self._count = torch.CudaLongTensor and torch.CudaLongTensor.new() or torch.CudaTensor.new()
+ self._input = torch.CudaLongTensor and torch.CudaLongTensor.new() or torch.CudaTensor.new()
+ else
+ -- self._count and self._input should only be converted if using Cuda
+ self._count = torch.IntTensor()
+ self._input = torch.LongTensor()
+ end
+
+ return self
+end
+
+function LookupTable:clearState()
+ nn.utils.clear(self, '_count', '_input')
+ return parent.clearState(self)
+end
+
+function LookupTable:sharedAccUpdateGradParameters(input, gradOutput, lr)
+ -- we do not need to accumulate parameters when sharing:
+ self:defaultAccUpdateGradParameters(input, gradOutput, lr)
+end
diff --git a/contrib/lua-torch/nn/MM.lua b/contrib/lua-torch/nn/MM.lua
new file mode 100644
index 000000000..cc978c8cb
--- /dev/null
+++ b/contrib/lua-torch/nn/MM.lua
@@ -0,0 +1,92 @@
+--[[ Module to perform matrix multiplication on two minibatch inputs,
+ producing a minibatch.
+]]
+
+local MM, parent = torch.class('nn.MM', 'nn.Module')
+
+--[[ The constructor takes two optional boolean arguments, specifying whether
+   to transpose the first and/or the second input matrix before performing
+   the multiplication.
+]]
+function MM:__init(transA, transB)
+ parent.__init(self)
+
+ self.transA = transA or false
+ self.transB = transB or false
+
+ self.gradInput = {torch.Tensor(), torch.Tensor()}
+end
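+
+-- Usage sketch (batch and matrix sizes are illustrative):
+--
+--   local mm = nn.MM(false, true)  -- computes a * b:t() per batch entry
+--   local a, b = torch.rand(5, 3, 4), torch.rand(5, 6, 4)
+--   local out = mm:forward({a, b}) -- batched product, size 5x3x6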
+
+function MM:updateOutput(input)
+ assert(#input == 2, 'input must be a pair of minibatch matrices')
+ local a, b = table.unpack(input)
+ assert(a:nDimension() == 2 or a:nDimension() == 3, 'input tensors must be 2D or 3D')
+
+ if a:nDimension() == 2 then
+ assert(b:nDimension() == 2, 'second input tensor must be 2D')
+
+ if self.transA then a = a:t() end
+ if self.transB then b = b:t() end
+ assert(a:size(2) == b:size(1), 'matrix sizes do not match')
+
+ self.output:resize(a:size(1), b:size(2))
+ self.output:mm(a, b)
+ else
+ assert(b:nDimension() == 3, 'second input tensor must be 3D')
+ assert(a:size(1) == b:size(1), 'inputs must contain the same number of minibatches')
+
+ if self.transA then a = a:transpose(2, 3) end
+ if self.transB then b = b:transpose(2, 3) end
+ assert(a:size(3) == b:size(2), 'matrix sizes do not match')
+
+ self.output:resize(a:size(1), a:size(2), b:size(3))
+ self.output:bmm(a, b)
+ end
+
+ return self.output
+end
+
+function MM:updateGradInput(input, gradOutput)
+ self.gradInput[1] = self.gradInput[1] or input[1].new()
+ self.gradInput[2] = self.gradInput[2] or input[2].new()
+
+ assert(#input == 2, 'input must be a pair of tensors')
+ local a, b = table.unpack(input)
+ self.gradInput[1]:resizeAs(a)
+ self.gradInput[2]:resizeAs(b)
+
+ assert(gradOutput:nDimension() == 2 or gradOutput:nDimension() == 3, 'arguments must be a 2D or 3D Tensor')
+
+ local h_dim, w_dim, f
+ if gradOutput:nDimension() == 2 then
+ assert(a:nDimension() == 2, 'first input tensor must be 2D')
+ assert(b:nDimension() == 2, 'second input tensor must be 2D')
+
+ h_dim, w_dim = 1, 2
+ f = "mm"
+ else
+ assert(a:nDimension() == 3, 'first input tensor must be 3D')
+ assert(b:nDimension() == 3, 'second input tensor must be 3D')
+
+ h_dim, w_dim = 2, 3
+ f = "bmm"
+ end
+
+ if self.transA == self.transB then
+ a = a:transpose(h_dim, w_dim)
+ b = b:transpose(h_dim, w_dim)
+ end
+
+ if self.transA then
+ self.gradInput[1][f](self.gradInput[1], b, gradOutput:transpose(h_dim, w_dim))
+ else
+ self.gradInput[1][f](self.gradInput[1], gradOutput, b)
+ end
+
+ if self.transB then
+ self.gradInput[2][f](self.gradInput[2], gradOutput:transpose(h_dim, w_dim), a)
+ else
+ self.gradInput[2][f](self.gradInput[2], a, gradOutput)
+ end
+
+ return self.gradInput
+end
diff --git a/contrib/lua-torch/nn/MSECriterion.lua b/contrib/lua-torch/nn/MSECriterion.lua
new file mode 100644
index 000000000..d38beb6bf
--- /dev/null
+++ b/contrib/lua-torch/nn/MSECriterion.lua
@@ -0,0 +1,32 @@
+local MSECriterion, parent = torch.class('nn.MSECriterion', 'nn.Criterion')
+
+function MSECriterion:__init(sizeAverage)
+ parent.__init(self)
+ if sizeAverage ~= nil then
+ self.sizeAverage = sizeAverage
+ else
+ self.sizeAverage = true
+ end
+end
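+
+-- loss(x, t) = sum_i (x[i] - t[i])^2, divided by x:nElement() when
+-- sizeAverage is true. Worked example:
+--
+--   nn.MSECriterion():forward(torch.Tensor{0, 1}, torch.Tensor{0, 0}) -- 0.5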
+
+function MSECriterion:updateOutput(input, target)
+ self.output_tensor = self.output_tensor or input.new(1)
+ input.THNN.MSECriterion_updateOutput(
+ input:cdata(),
+ target:cdata(),
+ self.output_tensor:cdata(),
+ self.sizeAverage
+ )
+ self.output = self.output_tensor[1]
+ return self.output
+end
+
+function MSECriterion:updateGradInput(input, target)
+ input.THNN.MSECriterion_updateGradInput(
+ input:cdata(),
+ target:cdata(),
+ self.gradInput:cdata(),
+ self.sizeAverage
+ )
+ return self.gradInput
+end
diff --git a/contrib/lua-torch/nn/MV.lua b/contrib/lua-torch/nn/MV.lua
new file mode 100644
index 000000000..a00478ef6
--- /dev/null
+++ b/contrib/lua-torch/nn/MV.lua
@@ -0,0 +1,82 @@
+--[[ Module to perform matrix-vector multiplication on two minibatch inputs,
+producing a minibatch.
+]]
+
+local MV, parent = torch.class('nn.MV', 'nn.Module')
+
+-- Backward compatibility
+local unpack = unpack or table.unpack
+
+function MV:__init(trans)
+ parent.__init(self)
+
+ self.trans = trans or false
+   assert(type(self.trans) == 'boolean', "argument must be a boolean: whether to transpose the matrix before multiplication")
+
+ self.gradInput = {torch.Tensor(), torch.Tensor()}
+end
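+
+-- Usage sketch (batch and matrix sizes are illustrative):
+--
+--   local mv = nn.MV()
+--   local M, v = torch.rand(5, 3, 4), torch.rand(5, 4)
+--   local out = mv:forward({M, v}) -- batched M * v, size 5x3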
+
+function MV:updateOutput(input)
+ assert(#input == 2, 'input must be a pair of minibatch matrices')
+ local M, v = unpack(input)
+ assert(M:nDimension() == 2 or M:nDimension() == 3, 'input matrix must be 2D or 3D')
+ assert(v:nDimension() == 1 or v:nDimension() == 2, 'input vector must be 1D or 2D')
+
+ if M:nDimension() == 2 then
+ assert(v:nDimension() == 1, 'vector must be 1D')
+
+ if self.trans then M = M:transpose(1,2) end
+      assert(M:size(2) == v:size(1), 'matrix column count and vector length do not match')
+
+ self.output:resize(M:size(1))
+ self.output:mv(M, v)
+ else
+ assert(v:nDimension() == 2, 'vector must be 2D (batch dimension)')
+ assert(M:size(1) == v:size(1), 'inputs must contain the same number of minibatches')
+
+ if self.trans then M = M:transpose(2,3) end
+      assert(M:size(3) == v:size(2), 'matrix column count and vector length do not match')
+
+ self.output:resize(M:size(1), M:size(2), 1)
+ self.output:bmm(M, v:view(v:size(1), v:size(2), 1)):resize(M:size(1), M:size(2))
+ end
+
+ return self.output
+end
+
+function MV:updateGradInput(input, gradOutput)
+ assert(#input == 2, 'input must be a pair of tensors')
+ local M, v = unpack(input)
+ self.gradInput[1]:resizeAs(M)
+ self.gradInput[2]:resizeAs(v)
+
+ assert(gradOutput:nDimension() == 1 or gradOutput:nDimension() == 2, 'arguments must be a 1D or 2D Tensor')
+
+ if gradOutput:nDimension() == 2 then
+      assert(M:nDimension() == 3, 'matrix must be 3D (batched)')
+ assert(v:nDimension() == 2, 'vector must be 2D (batched)')
+ local bdim = M:size(1)
+ local odim = M:size(2)
+ local idim = M:size(3)
+
+ if self.trans then
+ self.gradInput[1]:bmm(v:view(bdim, odim, 1), gradOutput:view(bdim, 1, idim))
+ self.gradInput[2]:view(bdim, odim, 1):bmm(M, gradOutput:view(bdim, idim, 1))
+ else
+ self.gradInput[1]:bmm(gradOutput:view(bdim, odim, 1), v:view(bdim, 1, idim))
+ self.gradInput[2]:view(bdim, idim, 1):bmm(M:transpose(2,3), gradOutput:view(bdim, odim, 1))
+ end
+ else
+ assert(M:nDimension() == 2, 'matrix must be 2D')
+ assert(v:nDimension() == 1, 'vector must be 1D')
+
+ if self.trans then
+ self.gradInput[1]:ger(v, gradOutput)
+ self.gradInput[2] = M * gradOutput
+ else
+ self.gradInput[1]:ger(gradOutput, v)
+ self.gradInput[2] = M:t() * gradOutput
+ end
+ end
+ return self.gradInput
+end
diff --git a/contrib/lua-torch/nn/MapTable.lua b/contrib/lua-torch/nn/MapTable.lua
new file mode 100644
index 000000000..c79f1ea1d
--- /dev/null
+++ b/contrib/lua-torch/nn/MapTable.lua
@@ -0,0 +1,119 @@
+local MapTable, parent = torch.class('nn.MapTable', 'nn.Container')
+
+function MapTable:__init(module, shared)
+ parent.__init(self)
+ self.shared = (shared == nil) and true or shared
+ self.sharedparams = {'weight', 'bias', 'gradWeight', 'gradBias'}
+ self.output = {}
+ self.gradInput = {}
+ self:add(module)
+end
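+
+-- Usage sketch (sizes are illustrative): the same module, with shared
+-- weights by default, is applied to every entry of the input table.
+--
+--   local map = nn.MapTable(nn.Linear(10, 3))
+--   local out = map:forward({torch.rand(10), torch.rand(10)}) -- two size-3 outputs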
+
+function MapTable:_extend(n)
+ self.sharedparams = self.sharedparams or {'weight', 'bias', 'gradWeight', 'gradBias'}
+ self.modules[1] = self.module
+ for i = 2, n do
+ if not self.modules[i] then
+ if self.shared then
+ self.modules[i] = self.module:clone(table.unpack(self.sharedparams))
+ else
+ self.modules[i] = self.module:clone()
+ end
+ end
+ end
+end
+
+function MapTable:resize(n)
+ self:_extend(n)
+ for i = n + 1, #self.modules do
+ -- It's not clear why this clearState call is necessary, but it fixes
+ -- https://github.com/torch/nn/issues/1141 .
+ self.modules[i]:clearState()
+ self.modules[i] = nil
+ end
+end
+
+function MapTable:add(module)
+ assert(not self.module, 'Single module required')
+ self.module = module
+ self.modules[1] = self.module
+ return self
+end
+
+function MapTable:updateOutput(input)
+ self.output = {}
+ self:_extend(#input)
+ for i = 1, #input do
+ self.output[i] = self:rethrowErrors(self.modules[i], i, 'updateOutput', input[i])
+ end
+ return self.output
+end
+
+function MapTable:updateGradInput(input, gradOutput)
+ self.gradInput = {}
+ self:_extend(#input)
+ for i = 1, #input do
+ self.gradInput[i] = self:rethrowErrors(self.modules[i], i, 'updateGradInput', input[i], gradOutput[i])
+ end
+ return self.gradInput
+end
+
+function MapTable:accGradParameters(input, gradOutput, scale)
+ scale = scale or 1
+ self:_extend(#input)
+ for i = 1, #input do
+ self:rethrowErrors(self.modules[i], i, 'accGradParameters', input[i], gradOutput[i], scale)
+ end
+end
+
+function MapTable:accUpdateGradParameters(input, gradOutput, lr)
+ lr = lr or 1
+ self:_extend(#input)
+ for i = 1, #input do
+ self:rethrowErrors(self.modules[i], i, 'accUpdateGradParameters', input[i], gradOutput[i], lr)
+ end
+end
+
+function MapTable:zeroGradParameters()
+ if self.module then
+ if self.shared then
+ self.module:zeroGradParameters()
+ else
+ parent.zeroGradParameters(self)
+ end
+ end
+end
+
+function MapTable:updateParameters(learningRate)
+ if self.module then
+ if self.shared then
+ self.module:updateParameters(learningRate)
+ else
+ parent.updateParameters(self, learningRate)
+ end
+ end
+end
+
+function MapTable:clearState()
+ for i = 2, #self.modules do
+ -- It's not clear why this clearState call is necessary, but it fixes
+ -- https://github.com/torch/nn/issues/1141 .
+ self.modules[i]:clearState()
+ self.modules[i] = nil
+ end
+ parent.clearState(self)
+end
+
+function MapTable:__tostring__()
+ local tab = ' '
+ local line = '\n'
+ local extlast = ' '
+ local str = torch.type(self)
+ if self.module then
+ str = str .. ' {' .. line .. tab
+ str = str .. tostring(self.module):gsub(line, line .. tab .. extlast) .. line .. '}'
+ else
+ str = str .. ' { }'
+ end
+ return str
+end
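
MapTable applies a single module to every entry of an input table, lazily cloning it as the table grows; by default the clones share weight, bias, gradWeight and gradBias. A minimal usage sketch (sizes are illustrative, not from the diff):

    require 'nn'
    local map = nn.MapTable(nn.Linear(10, 3))      -- clones share parameters by default
    local out = map:forward({torch.randn(10), torch.randn(10)})
    -- out is a table of two 3-dimensional tensors produced by the same weights
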
diff --git a/contrib/lua-torch/nn/MarginCriterion.lua b/contrib/lua-torch/nn/MarginCriterion.lua
new file mode 100644
index 000000000..1ab8ad784
--- /dev/null
+++ b/contrib/lua-torch/nn/MarginCriterion.lua
@@ -0,0 +1,31 @@
+local MarginCriterion, parent = torch.class('nn.MarginCriterion', 'nn.Criterion')
+
+function MarginCriterion:__init(margin)
+ parent.__init(self)
+ self.sizeAverage = true
+ self.margin = margin or 1
+end
+
+function MarginCriterion:updateOutput(input, target)
+ self.output_tensor = self.output_tensor or input.new(1)
+ input.THNN.MarginCriterion_updateOutput(
+ input:cdata(),
+ target:cdata(),
+ self.output_tensor:cdata(),
+ self.sizeAverage,
+ self.margin
+ )
+ self.output = self.output_tensor[1]
+ return self.output
+end
+
+function MarginCriterion:updateGradInput(input, target)
+ input.THNN.MarginCriterion_updateGradInput(
+ input:cdata(),
+ target:cdata(),
+ self.gradInput:cdata(),
+ self.sizeAverage,
+ self.margin
+ )
+ return self.gradInput
+end
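
MarginCriterion is the element-wise hinge loss max(0, margin - y*x) for targets in {-1, 1}, averaged over elements when sizeAverage is true (the default). A small worked sketch, not part of the upstream file:

    require 'nn'
    local crit = nn.MarginCriterion()              -- margin defaults to 1
    local input  = torch.Tensor{0.5, -0.2}
    local target = torch.Tensor{1, -1}
    local loss = crit:forward(input, target)
    -- (max(0, 1 - 0.5) + max(0, 1 - 0.2)) / 2 = 0.65
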
diff --git a/contrib/lua-torch/nn/MarginRankingCriterion.lua b/contrib/lua-torch/nn/MarginRankingCriterion.lua
new file mode 100644
index 000000000..844d905d5
--- /dev/null
+++ b/contrib/lua-torch/nn/MarginRankingCriterion.lua
@@ -0,0 +1,75 @@
+local MarginRankingCriterion, parent = torch.class('nn.MarginRankingCriterion', 'nn.Criterion')
+
+function MarginRankingCriterion:__init(margin)
+ parent.__init(self)
+ margin=margin or 1
+ self.margin = margin
+ self.gradInput = {torch.Tensor(1), torch.Tensor(1)}
+ self.sizeAverage = true
+end
+
+function MarginRankingCriterion:updateOutput(input, y)
+ if torch.type(y) == 'number' then -- non-batch mode
+ self.output = math.max(0, -y * (input[1][1] - input[2][1]) + self.margin)
+ else
+ self._output = self._output or input[1]:clone()
+ self._output:resizeAs(input[1])
+ self._output:copy(input[1])
+
+ self._output:add(-1, input[2])
+ self._output:mul(-1):cmul(y)
+ self._output:add(self.margin)
+
+ self._output:cmax(0)
+
+ self.output = self._output:sum()
+
+ if self.sizeAverage then
+ self.output = self.output/y:size(1)
+ end
+ end
+
+ return self.output
+end
+
+function MarginRankingCriterion:updateGradInput(input, y)
+ if torch.type(y) == 'number' then -- non-batch mode
+ local dist = -y * (input[1][1] - input[2][1]) + self.margin
+ if dist < 0 then
+ self.gradInput[1][1] = 0;
+ self.gradInput[2][1] = 0;
+ else
+ self.gradInput[1][1] = -y
+ self.gradInput[2][1] = y
+ end
+ else
+ self.dist = self.dist or input[1].new()
+ self.dist = self.dist:resizeAs(input[1]):copy(input[1])
+ local dist = self.dist
+
+ dist:add(-1, input[2])
+ dist:mul(-1):cmul(y)
+ dist:add(self.margin)
+
+ self.mask = self.mask or input[1].new()
+ self.mask = self.mask:resizeAs(input[1]):copy(dist)
+ local mask = self.mask
+
+ mask:ge(dist, 0)
+
+ self.gradInput[1]:resize(dist:size())
+ self.gradInput[2]:resize(dist:size())
+
+ self.gradInput[1]:copy(mask)
+ self.gradInput[1]:mul(-1):cmul(y)
+ self.gradInput[2]:copy(mask)
+ self.gradInput[2]:cmul(y)
+
+ if self.sizeAverage then
+ self.gradInput[1]:div(y:size(1))
+ self.gradInput[2]:div(y:size(1))
+ end
+
+ end
+ return self.gradInput
+end
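
MarginRankingCriterion scores a pair (x1, x2) with max(0, -y*(x1 - x2) + margin), where y = 1 asks x1 to rank above x2 and y = -1 the reverse; the code above handles both a scalar y (non-batch) and a tensor y (batch). A non-batch sketch for illustration:

    require 'nn'
    local crit = nn.MarginRankingCriterion(0.5)
    local x1, x2 = torch.Tensor{0.9}, torch.Tensor{0.1}
    local loss = crit:forward({x1, x2}, 1)
    -- max(0, -(0.9 - 0.1) + 0.5) = 0: the pair is already ranked with enough margin
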
diff --git a/contrib/lua-torch/nn/MaskedSelect.lua b/contrib/lua-torch/nn/MaskedSelect.lua
new file mode 100644
index 000000000..c3f7834e1
--- /dev/null
+++ b/contrib/lua-torch/nn/MaskedSelect.lua
@@ -0,0 +1,71 @@
+local unpack = unpack or table.unpack
+
+local MaskedSelect, parent = torch.class('nn.MaskedSelect', 'nn.Module')
+
+--[[ Initializes the index and gradient buffers used by maskedSelect. ]]
+function MaskedSelect:__init()
+ parent.__init(self)
+ self._maskIndices = torch.LongTensor()
+ self._maskIndexBuffer = torch.LongTensor()
+ self._maskIndexBufferCPU = torch.FloatTensor()
+ self._gradBuffer = torch.Tensor()
+ self._gradMask = torch.ByteTensor()
+end
+
+--[[ Performs maskedSelect operation. ]]
+function MaskedSelect:updateOutput(input)
+ local input, mask = unpack(input)
+ self.output:maskedSelect(input, mask)
+ return self.output
+end
+
+--[[ Reverse maps unmasked gradOutput back to gradInput. ]]
+function MaskedSelect:updateGradInput(input, gradOutput)
+ local input, mask = unpack(input)
+ if input:type() == 'torch.CudaTensor' then
+ self._maskIndexBufferCPU:range(1, mask:nElement()):resize(mask:size())
+ self._maskIndexBuffer:resize(
+ self._maskIndexBufferCPU:size()):copy(self._maskIndexBufferCPU)
+ else
+ self._maskIndexBuffer:range(1, mask:nElement()):resize(mask:size())
+ end
+ self._maskIndices:maskedSelect(self._maskIndexBuffer, mask)
+ self._gradBuffer:resize(input:nElement()):zero()
+ self._gradBuffer:scatter(1, self._maskIndices, gradOutput)
+ self._gradBuffer:resize(input:size())
+ self.gradInput = {self._gradBuffer,
+ self._gradMask:resize(mask:size()):fill(0)}
+ return self.gradInput
+end
+
+function MaskedSelect:type(type, tensorCache)
+ if not type then
+ return self._type
+ end
+ self._gradBuffer = self._gradBuffer:type(type)
+ self.gradInput = self.gradInput:type(type)
+ self.output = self.output:type(type)
+
+ -- These casts apply when switching between cuda/non-cuda types
+ if type ~= 'torch.CudaTensor' then
+ self._maskIndexBuffer = self._maskIndexBuffer:long()
+ self._maskIndices = self._maskIndices:long()
+ self._gradMask = self._gradMask:byte()
+   else
+ self._maskIndexBuffer = self._maskIndexBuffer:cuda()
+ self._maskIndices = self._maskIndices:cuda()
+ self._gradMask = self._gradMask:cuda()
+ end
+ self._type = type
+ return self
+end
+
+function MaskedSelect:clearState()
+ return nn.utils.clear(self, {'output',
+ 'gradInput',
+ '_maskIndexBuffer',
+ '_maskIndexBufferCPU',
+ '_maskIndices',
+ '_gradBuffer',
+ '_gradMask'})
+end
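
MaskedSelect flattens out the entries of the input selected by a byte mask; its backward scatters the incoming gradient back to exactly those positions and leaves zeros elsewhere. A minimal sketch (illustrative only):

    require 'nn'
    local ms = nn.MaskedSelect()
    local input = torch.randn(2, 2)
    local mask = torch.ByteTensor{{1, 0}, {0, 1}}
    local out = ms:forward({input, mask})          -- 1D tensor of the 2 selected entries
    local grads = ms:backward({input, mask}, torch.Tensor{1, 1})
    -- grads[1] holds 1s at the masked positions and 0s elsewhere; grads[2] is all zeros
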
diff --git a/contrib/lua-torch/nn/Max.lua b/contrib/lua-torch/nn/Max.lua
new file mode 100644
index 000000000..8273e808c
--- /dev/null
+++ b/contrib/lua-torch/nn/Max.lua
@@ -0,0 +1,66 @@
+local Max, parent = torch.class('nn.Max', 'nn.Module')
+
+function Max:__init(dimension, nInputDims)
+ parent.__init(self)
+ dimension = dimension or 1
+ self.dimension = dimension
+ -- do not assign default value to nInputDims or it will break backward compatibility
+ self.nInputDims = nInputDims
+end
+
+function Max:_getPositiveDimension(input)
+ local dimension = self.dimension
+ if dimension < 0 then
+ dimension = input:dim() + dimension + 1
+ elseif self.nInputDims and input:dim()==(self.nInputDims+1) then
+ dimension = dimension + 1
+ end
+ return dimension
+end
+
+function Max:_lazyInit()
+ self._output = self._output or self.output.new()
+ if not self._indices then
+ if torch.typename(self.output):find('torch%.Cuda.*Tensor') then
+ self._indices = torch.CudaLongTensor and torch.CudaLongTensor() or torch.CudaTensor()
+ else
+ self._indices = torch.LongTensor()
+ end
+ end
+end
+
+function Max:updateOutput(input)
+ self:_lazyInit()
+ local dimension = self:_getPositiveDimension(input)
+ torch.max(self._output, self._indices, input, dimension)
+ if input:dim() > 1 then
+ self.output:set(self._output:select(dimension, 1))
+ else
+ self.output:set(self._output)
+ end
+ return self.output
+end
+
+function Max:updateGradInput(input, gradOutput)
+ self:_lazyInit()
+ local dimension = self:_getPositiveDimension(input)
+ local gradOutputView
+ if input:dim() > 1 then
+ gradOutputView = nn.utils.addSingletonDimension(gradOutput, dimension)
+ else
+ gradOutputView = gradOutput
+ end
+ self.gradInput:resizeAs(input):zero():scatter(dimension, self._indices, gradOutputView)
+ return self.gradInput
+end
+
+function Max:type(type, tensorCache)
+ self._indices = nil
+ parent.type(self, type, tensorCache)
+ return self
+end
+
+function Max:clearState()
+ nn.utils.clear(self, '_indices', '_output')
+ return parent.clearState(self)
+end
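
Max reduces one dimension by its maximum, remembering the argmax indices so the backward pass can scatter the gradient only to the winning entries. A small worked sketch, not part of the upstream file:

    require 'nn'
    local m = nn.Max(1)                 -- max over dimension 1
    local x = torch.Tensor{2, 7, 5}
    local y = m:forward(x)              -- one-element tensor holding 7
    local gx = m:backward(x, torch.Tensor{1})
    -- gx = {0, 1, 0}: only the argmax receives gradient
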
diff --git a/contrib/lua-torch/nn/Maxout.lua b/contrib/lua-torch/nn/Maxout.lua
new file mode 100644
index 000000000..a797a9f43
--- /dev/null
+++ b/contrib/lua-torch/nn/Maxout.lua
@@ -0,0 +1,13 @@
+-- Reference: http://jmlr.org/proceedings/papers/v28/goodfellow13.pdf
+
+local Maxout, parent = torch.class('nn.Maxout', 'nn.Sequential')
+
+function Maxout:__init(inputSize, outputSize, maxoutNumber, preprocess)
+ parent.__init(self)
+ self:add(nn.Linear(inputSize, outputSize * maxoutNumber))
+ self:add(nn.View(maxoutNumber, outputSize):setNumInputDims(1))
+ if preprocess then
+ self:add(preprocess)
+ end
+ self:add(nn.Max(1, 2))
+end
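
Maxout wires the construction from the referenced paper as Linear -> View -> Max: each output unit takes the maximum over maxoutNumber linear pieces. A minimal sketch with made-up sizes:

    require 'nn'
    local m = nn.Maxout(10, 4, 3)          -- 10 inputs, 4 outputs, 3 pieces per output
    local y = m:forward(torch.randn(10))   -- size 4; each entry is a max over 3 linear responses
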
diff --git a/contrib/lua-torch/nn/Mean.lua b/contrib/lua-torch/nn/Mean.lua
new file mode 100644
index 000000000..8087ac95e
--- /dev/null
+++ b/contrib/lua-torch/nn/Mean.lua
@@ -0,0 +1,14 @@
+local Mean, parent = torch.class('nn.Mean', 'nn.Sum')
+
+--[[
+
+This module is kept only for backward compatibility.
+
+Please use "nn.Sum(dimension, nInputDims, sizeAverage)" instead.
+
+]]--
+
+
+function Mean:__init(dimension, nInputDims)
+ parent.__init(self, dimension, nInputDims, true)
+end
diff --git a/contrib/lua-torch/nn/Min.lua b/contrib/lua-torch/nn/Min.lua
new file mode 100644
index 000000000..3a3e4a802
--- /dev/null
+++ b/contrib/lua-torch/nn/Min.lua
@@ -0,0 +1,66 @@
+local Min, parent = torch.class('nn.Min', 'nn.Module')
+
+function Min:__init(dimension, nInputDims)
+ parent.__init(self)
+ dimension = dimension or 1
+ self.dimension = dimension
+ -- do not assign default value to nInputDims or it will break backward compatibility
+ self.nInputDims = nInputDims
+end
+
+function Min:_getPositiveDimension(input)
+ local dimension = self.dimension
+ if dimension < 0 then
+ dimension = input:dim() + dimension + 1
+ elseif self.nInputDims and input:dim()==(self.nInputDims+1) then
+ dimension = dimension + 1
+ end
+ return dimension
+end
+
+function Min:_lazyInit()
+ self._output = self._output or self.output.new()
+ if not self._indices then
+ if torch.typename(self.output):find('torch%.Cuda.*Tensor') then
+ self._indices = torch.CudaLongTensor and torch.CudaLongTensor() or torch.CudaTensor()
+ else
+ self._indices = torch.LongTensor()
+ end
+ end
+end
+
+function Min:updateOutput(input)
+ self:_lazyInit()
+ local dimension = self:_getPositiveDimension(input)
+ torch.min(self._output, self._indices, input, dimension)
+ if input:dim() > 1 then
+ self.output:set(self._output:select(dimension, 1))
+ else
+ self.output:set(self._output)
+ end
+ return self.output
+end
+
+function Min:updateGradInput(input, gradOutput)
+ self:_lazyInit()
+ local dimension = self:_getPositiveDimension(input)
+ local gradOutputView
+ if input:dim() > 1 then
+ gradOutputView = nn.utils.addSingletonDimension(gradOutput, dimension)
+ else
+ gradOutputView = gradOutput
+ end
+ self.gradInput:resizeAs(input):zero():scatter(dimension, self._indices, gradOutputView)
+ return self.gradInput
+end
+
+function Min:type(type, tensorCache)
+ self._indices = nil
+ parent.type(self, type, tensorCache)
+ return self
+end
+
+function Min:clearState()
+ nn.utils.clear(self, '_indices', '_output')
+ return parent.clearState(self)
+end
diff --git a/contrib/lua-torch/nn/MixtureTable.lua b/contrib/lua-torch/nn/MixtureTable.lua
new file mode 100644
index 000000000..dbe19742f
--- /dev/null
+++ b/contrib/lua-torch/nn/MixtureTable.lua
@@ -0,0 +1,165 @@
+local MixtureTable, parent = torch.class('nn.MixtureTable', 'nn.Module')
+
+function MixtureTable:__init(dim)
+ parent.__init(self)
+ self.dim = dim
+ self.size = torch.LongStorage()
+ self.batchSize = 0
+ self.size2 = torch.LongStorage()
+ self.backwardSetup = false
+ self.gradInput = {}
+end
+
+function MixtureTable:updateOutput(input)
+ local gaterInput, expertInputs = table.unpack(input)
+
+ -- buffers
+ self._gaterView = self._gaterView or input[1].new()
+ self._expert = self._expert or input[1].new()
+ self._expertView = self._expertView or input[1].new()
+
+ self.dimG = 2
+ local batchSize = gaterInput:size(1)
+ if gaterInput:dim() < 2 then
+ self.dimG = 1
+ self.dim = self.dim or 1
+ batchSize = 1
+ end
+ self.dim = self.dim or 2
+
+ if self.table or torch.type(expertInputs) == 'table' then
+ -- expertInputs is a Table :
+ self.table = true
+ if gaterInput:size(self.dimG) ~= #expertInputs then
+ error"Should be one gater output per expert"
+ end
+ local expertInput = expertInputs[1]
+ self.size:resize(expertInput:dim()+1):fill(1)
+ if self.dimG > 1 then
+ self.size[1] = gaterInput:size(1)
+ end
+ self.size[self.dim] = gaterInput:size(self.dimG)
+ self.output:resizeAs(expertInput)
+ self.batchSize = batchSize
+ self._gaterView:view(gaterInput, self.size)
+ self.output:zero()
+      -- multiply-accumulate each expert input with its corresponding gater output
+ for i,expertInput in ipairs(expertInputs) do
+ local gate = self._gaterView:select(self.dim,i):expandAs(expertInput)
+ self.output:addcmul(expertInput, gate)
+ end
+ else
+ -- expertInputs is a Tensor :
+ self.size:resize(expertInputs:dim()):fill(1)
+ if self.dimG > 1 then
+ self.size[1] = gaterInput:size(1)
+ end
+ self.size[self.dim] = gaterInput:size(self.dimG)
+ self.output:resizeAs(expertInputs:select(self.dim, 1))
+ self.batchSize = batchSize
+ self._gaterView:view(gaterInput, self.size)
+ self._expert:cmul(self._gaterView:expandAs(expertInputs), expertInputs)
+ self.output:sum(self._expert, self.dim)
+ self.output:resizeAs(expertInputs:select(self.dim, 1))
+ end
+
+ return self.output
+end
+
+function MixtureTable:updateGradInput(input, gradOutput)
+ local gaterInput, expertInputs = table.unpack(input)
+ nn.utils.recursiveResizeAs(self.gradInput, input)
+ local gaterGradInput, expertGradInputs = table.unpack(self.gradInput)
+
+ -- buffers
+ self._sum = self._sum or input[1].new()
+ self._expertView2 = self._expertView2 or input[1].new()
+ self._expert2 = self._expert2 or input[1].new()
+
+ if self.table then
+ for i,expertInput in ipairs(expertInputs) do
+ local expertGradInput = expertGradInputs[i] or expertInput:clone()
+ expertGradInput:resizeAs(expertInput)
+ expertGradInputs[i] = expertGradInput
+ end
+ gaterGradInput:resizeAs(gaterInput)
+
+ -- Clear invalid gradients
+ if #expertGradInputs > #expertInputs then
+ for i=#expertInputs+1, #expertGradInputs do
+ expertGradInputs[i] = nil
+ end
+ end
+
+ -- like CMulTable, but with broadcasting
+ for i,expertGradInput in ipairs(expertGradInputs) do
+ -- gater updateGradInput
+ self._expert:cmul(gradOutput, expertInputs[i])
+ if self.dimG == 1 then
+ self._expertView:view(self._expert, -1)
+ else
+ self._expertView:view(self._expert, gradOutput:size(1), -1)
+ end
+ self._sum:sum(self._expertView, self.dimG)
+ if self.dimG == 1 then
+ gaterGradInput[i] = self._sum:select(self.dimG,1)
+ else
+ gaterGradInput:select(self.dimG,i):copy(self._sum:select(self.dimG,1))
+ end
+
+ -- expert updateGradInput
+ local gate = self._gaterView:select(self.dim,i):expandAs(expertGradInput)
+ expertGradInput:cmul(gate, gradOutput)
+ end
+ else
+ self.size2:resize(expertInputs:dim())
+ self.size2:copy(expertInputs:size())
+ self.size2[self.dim] = 1
+ gaterGradInput:resizeAs(gaterInput)
+
+ -- gater updateGradInput
+ self._expertView:view(gradOutput, self.size2)
+ local gradOutput = self._expertView:expandAs(expertInputs)
+ self._expert:cmul(gradOutput, expertInputs)
+ local expert = self._expert:transpose(self.dim, self.dimG)
+ if not expert:isContiguous() then
+ self._expert2:resizeAs(expert)
+ self._expert2:copy(expert)
+ expert = self._expert2
+ end
+ if self.dimG == 1 then
+ self._expertView2:view(expert, gaterInput:size(1), -1)
+ else
+ self._expertView2:view(expert, gaterInput:size(1), gaterInput:size(2), -1)
+ end
+ gaterGradInput:sum(self._expertView2, self.dimG+1)
+ gaterGradInput:resizeAs(gaterInput)
+
+ -- expert updateGradInput
+ expertGradInputs:cmul(self._gaterView:expandAs(expertInputs), gradOutput)
+ end
+
+ return self.gradInput
+end
+
+function MixtureTable:type(type, tensorCache)
+ self._gaterView = nil
+ self._expert = nil
+ self._expertView = nil
+ self._sum = nil
+ self._expert2 = nil
+ self._expertView2 = nil
+ return parent.type(self, type, tensorCache)
+end
+
+function MixtureTable:clearState()
+ nn.utils.clear(self, {
+ '_gaterView',
+ '_expert',
+ '_expertView',
+ '_sum',
+ '_expert2',
+ '_expertView2',
+ })
+ return parent.clearState(self)
+end
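
MixtureTable blends expert outputs with gater weights, output = sum_i g[i] * expert_i, accepting the experts either as a table or stacked in a tensor. A small worked sketch for the table form (values are illustrative):

    require 'nn'
    local mix = nn.MixtureTable()
    local gater = torch.Tensor{{0.7, 0.3}}                        -- batch of 1, two experts
    local experts = {torch.Tensor{{1, 1}}, torch.Tensor{{3, 5}}}
    local out = mix:forward({gater, experts})
    -- 0.7*{1,1} + 0.3*{3,5} = {1.6, 2.2}
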
diff --git a/contrib/lua-torch/nn/Module.lua b/contrib/lua-torch/nn/Module.lua
new file mode 100644
index 000000000..3debc5789
--- /dev/null
+++ b/contrib/lua-torch/nn/Module.lua
@@ -0,0 +1,429 @@
+local Module = torch.class('nn.Module')
+
+function Module:__init()
+ self.gradInput = torch.Tensor()
+ self.output = torch.Tensor()
+ self._type = self.output:type()
+end
+
+function Module:parameters()
+ if self.weight and self.bias then
+ return {self.weight, self.bias}, {self.gradWeight, self.gradBias}
+ elseif self.weight then
+ return {self.weight}, {self.gradWeight}
+ elseif self.bias then
+ return {self.bias}, {self.gradBias}
+ else
+ return
+ end
+end
+
+function Module:updateOutput(input)
+ return self.output
+end
+
+function Module:forward(input)
+ return self:updateOutput(input)
+end
+
+function Module:backward(input, gradOutput, scale)
+ scale = scale or 1
+ self:updateGradInput(input, gradOutput)
+ self:accGradParameters(input, gradOutput, scale)
+ return self.gradInput
+end
+
+function Module:backwardUpdate(input, gradOutput, lr)
+ self:updateGradInput(input, gradOutput)
+ self:accUpdateGradParameters(input, gradOutput, lr)
+ return self.gradInput
+end
+
+function Module:updateGradInput(input, gradOutput)
+ return self.gradInput
+end
+
+function Module:accGradParameters(input, gradOutput, scale)
+end
+
+function Module:accUpdateGradParameters(input, gradOutput, lr)
+ if self.shared then
+ self:sharedAccUpdateGradParameters(input, gradOutput, lr)
+ else
+ self:defaultAccUpdateGradParameters(input, gradOutput, lr)
+ end
+end
+
+function Module:defaultAccUpdateGradParameters(input, gradOutput, lr)
+ local gradWeight = self.gradWeight
+ local gradBias = self.gradBias
+ self.gradWeight = self.weight
+ self.gradBias = self.bias
+ self:accGradParameters(input, gradOutput, -lr)
+ self.gradWeight = gradWeight
+ self.gradBias = gradBias
+end
+
+function Module:sharedAccUpdateGradParameters(input, gradOutput, lr)
+ if self:parameters() then
+ self:zeroGradParameters()
+ self:accGradParameters(input, gradOutput, 1)
+ self:updateParameters(lr)
+ end
+end
+
+function Module:zeroGradParameters()
+ local _,gradParams = self:parameters()
+ if gradParams then
+ for i=1,#gradParams do
+ gradParams[i]:zero()
+ end
+ end
+end
+
+function Module:updateParameters(learningRate)
+ local params, gradParams = self:parameters()
+ if params then
+ for i=1,#params do
+ params[i]:add(-learningRate, gradParams[i])
+ end
+ end
+end
+
+function Module:training()
+ self.train = true
+end
+
+function Module:evaluate()
+ self.train = false
+end
+
+function Module:share(mlp, ...)
+ local arg = {...}
+ for i,v in ipairs(arg) do
+ if self[v] ~= nil then
+ self[v]:set(mlp[v])
+ self.shared = true
+ mlp.shared = true
+ end
+ end
+ return self
+end
+
+local function sharedWrite(...)
+ local arg = {...}
+ local shared = {}
+ for i,v in ipairs(arg) do
+ shared[v] = true
+ end
+ return function(self, file)
+ local object = {}
+ for k, v in pairs(self) do
+ if shared[k] then
+ assert(torch.isTensor(v), 'Shared parameters have to be Tensors')
+ object[k] = v.new()
+ else
+ object[k] = v
+ end
+ end
+ file:writeObject(object)
+ end
+end
+
+function Module:clone(...)
+ local oldWrite = nn.Module.write
+ nn.Module.write = sharedWrite(...)
+
+ local f = torch.MemoryFile("rw"):binary()
+ f:writeObject(self)
+ f:seek(1)
+ local clone = f:readObject()
+ f:close()
+
+ nn.Module.write = oldWrite
+
+ if select('#',...) > 0 then
+ clone:share(self,...)
+ end
+ return clone
+end
+
+function Module:type(type, tensorCache)
+ if not type then
+ return self._type
+ end
+
+ tensorCache = tensorCache or {}
+
+ -- find all tensors and convert them
+ for key,param in pairs(self) do
+ self[key] = nn.utils.recursiveType(param, type, tensorCache)
+ end
+
+ self._type = type
+ return self
+end
+
+function Module:float(...)
+ return self:type('torch.FloatTensor',...)
+end
+
+function Module:double(...)
+ return self:type('torch.DoubleTensor',...)
+end
+
+function Module:cuda(...)
+ return self:type('torch.CudaTensor',...)
+end
+
+function Module:reset()
+end
+
+function Module:write(file)
+ -- Write all values in the object as a table.
+ local object = {}
+ for k, v in pairs(self) do
+ object[k] = v
+ end
+ file:writeObject(object)
+end
+
+function Module:read(file)
+ local object = file:readObject()
+ for k, v in pairs(object) do
+ self[k] = v
+ end
+end
+
+-- This function is not easy to understand. It works as follows:
+--
+-- - gather all parameter tensors for this module (and children);
+-- count all parameter values (floats)
+-- - create one ginormous memory area (Storage object) with room for all
+-- parameters
+-- - remap each parameter tensor to point to an area within the ginormous
+-- Storage, and copy it there
+--
+-- It has the effect of making all parameters point to the same memory area,
+-- which is then returned.
+--
+-- The purpose is to allow operations over all parameters (such as momentum
+-- updates and serialization), but it assumes that all parameters are of
+-- the same type (and, in the case of CUDA, on the same device), which
+-- is not always true. Use for_each() to iterate over this module and
+-- children instead.
+--
+-- Module._flattenTensorBuffer can be used by other packages (e.g. cunn)
+-- to specify the type of temporary buffers. For example, the temporary
+-- buffers for CudaTensor could be FloatTensor, to avoid GPU memory usage.
+--
+-- TODO: This logically belongs to torch.Tensor, not nn.
+Module._flattenTensorBuffer = {}
+function Module.flatten(parameters)
+
+ -- returns true if tensor occupies a contiguous region of memory (no holes)
+ local function isCompact(tensor)
+ local sortedStride, perm = torch.sort(
+ torch.LongTensor(tensor:nDimension()):set(tensor:stride()), 1, true)
+ local sortedSize = torch.LongTensor(tensor:nDimension()):set(
+ tensor:size()):index(1, perm)
+ local nRealDim = torch.clamp(sortedStride, 0, 1):sum()
+ sortedStride = sortedStride:narrow(1, 1, nRealDim):clone()
+ sortedSize = sortedSize:narrow(1, 1, nRealDim):clone()
+ local t = tensor.new():set(tensor:storage(), 1,
+ sortedSize:storage(),
+ sortedStride:storage())
+ return t:isContiguous()
+ end
+
+ if not parameters or #parameters == 0 then
+ return torch.Tensor()
+ end
+ local Tensor = parameters[1].new
+ local TmpTensor = Module._flattenTensorBuffer[torch.type(parameters[1])] or Tensor
+
+ -- 1. construct the set of all unique storages referenced by parameter tensors
+ local storages = {}
+ local nParameters = 0
+ local parameterMeta = {}
+ for k = 1,#parameters do
+ local param = parameters[k]
+ local storage = parameters[k]:storage()
+ local storageKey = torch.pointer(storage)
+
+ if not storages[storageKey] then
+ storages[storageKey] = {storage, nParameters}
+ nParameters = nParameters + storage:size()
+ end
+
+ parameterMeta[k] = {storageOffset = param:storageOffset() +
+ storages[storageKey][2],
+ size = param:size(),
+ stride = param:stride()}
+ end
+
+ -- 2. construct a single tensor that will hold all the parameters
+ local flatParameters = TmpTensor(nParameters):zero()
+
+ -- 3. determine if there are elements in the storage that none of the
+ -- parameter tensors reference ('holes')
+ local tensorsCompact = true
+ for k = 1,#parameters do
+ local meta = parameterMeta[k]
+ local tmp = TmpTensor():set(
+ flatParameters:storage(), meta.storageOffset, meta.size, meta.stride)
+ tmp:fill(1)
+ tensorsCompact = tensorsCompact and isCompact(tmp)
+ end
+
+ local maskParameters = flatParameters:byte():clone()
+ local compactOffsets = flatParameters:long():cumsum(1)
+ local nUsedParameters = compactOffsets[-1]
+
+ -- 4. copy storages into the flattened parameter tensor
+ for _, storageAndOffset in pairs(storages) do
+ local storage, offset = table.unpack(storageAndOffset)
+ flatParameters[{{offset+1,offset+storage:size()}}]:copy(Tensor():set(storage))
+ end
+
+ -- 5. allow garbage collection
+ storages = nil
+ for k = 1,#parameters do
+ parameters[k]:set(Tensor())
+ end
+
+ -- 6. compact the flattened parameters if there were holes
+ if nUsedParameters ~= nParameters then
+ assert(tensorsCompact,
+ "Cannot gather tensors that are not compact")
+
+ flatParameters = TmpTensor(nUsedParameters):copy(
+ flatParameters:maskedSelect(maskParameters))
+ for k = 1,#parameters do
+ parameterMeta[k].storageOffset =
+ compactOffsets[parameterMeta[k].storageOffset]
+ end
+ end
+
+ if TmpTensor ~= Tensor then
+ flatParameters = Tensor(flatParameters:nElement()):copy(flatParameters)
+ end
+
+ -- 7. fix up the parameter tensors to point at the flattened parameters
+ for k = 1,#parameters do
+ parameters[k]:set(flatParameters:storage(),
+ parameterMeta[k].storageOffset,
+ parameterMeta[k].size,
+ parameterMeta[k].stride)
+ end
+
+ return flatParameters
+end
+
+function Module:getParameters()
+ -- get parameters
+ local parameters,gradParameters = self:parameters()
+ local p, g = Module.flatten(parameters), Module.flatten(gradParameters)
+ assert(p:nElement() == g:nElement(),
+ 'check that you are sharing parameters and gradParameters')
+ if parameters then
+ for i=1,#parameters do
+ assert(parameters[i]:storageOffset() == gradParameters[i]:storageOffset(),
+ 'misaligned parameter at ' .. tostring(i))
+ end
+ end
+ return p, g
+end
+
+function Module:__call__(input, gradOutput)
+ self:forward(input)
+ if gradOutput then
+ self:backward(input, gradOutput)
+ return self.output, self.gradInput
+ else
+ return self.output
+ end
+end
+
+-- Run a callback (called with the module as an argument) in preorder over this
+-- module and its children.
+--
+function Module:apply(callback)
+ callback(self)
+
+ if self.modules then
+ for _, module in ipairs(self.modules) do
+ module:apply(callback)
+ end
+ end
+end
+
+function Module:findModules(typename, container)
+ container = container or self
+ local nodes = {}
+ local containers = {}
+ local mod_type = torch.typename(self)
+ if mod_type == typename then
+ nodes[#nodes+1] = self
+ containers[#containers+1] = container
+ end
+ -- Recurse on nodes with 'modules'
+ if (self.modules ~= nil) then
+ if (torch.type(self.modules) == 'table') then
+ for i = 1, #self.modules do
+ local child = self.modules[i]
+ local cur_nodes, cur_containers =
+ child:findModules(typename, self)
+ assert(#cur_nodes == #cur_containers,
+ 'Internal error: incorrect return length') -- This shouldn't happen
+            -- add the list items from our child to our list (i.e. return a
+            -- flattened table of the returned nodes).
+ for j = 1, #cur_nodes do
+ nodes[#nodes+1] = cur_nodes[j]
+ containers[#containers+1] = cur_containers[j]
+ end
+ end
+ end
+ end
+ return nodes, containers
+end
+
+-- returns a list of modules
+function Module:listModules()
+ local function tinsert(to, from)
+ if torch.type(from) == 'table' then
+ for i=1,#from do
+ tinsert(to,from[i])
+ end
+ else
+ table.insert(to,from)
+ end
+ end
+ -- include self first
+ local modules = {self}
+ if self.modules then
+ for i=1,#self.modules do
+         local childModules = self.modules[i]:listModules()
+         if childModules then
+            tinsert(modules, childModules)
+ end
+ end
+ end
+ return modules
+end
+
+function Module:clearState()
+ return nn.utils.clear(self, 'output', 'gradInput')
+end
+
+-- similar to apply, recursively goes over network and calls
+-- a callback function which returns a new module replacing the old one
+function nn.Module:replace(callback)
+ local out = callback(self)
+ if self.modules then
+ for i, module in ipairs(self.modules) do
+ self.modules[i] = module:replace(callback)
+ end
+ end
+ return out
+end
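
Module is the abstract base class of the package; the flatten/getParameters machinery above is what lets an optimizer treat all parameters and all gradients as two contiguous vectors. A minimal sketch of a manual SGD step through the flattened views (not from the upstream file):

    require 'nn'
    local net = nn.Linear(10, 2)
    local params, gradParams = net:getParameters()   -- flat 22-element vectors (10*2 + 2)
    local x = torch.randn(10)
    net:zeroGradParameters()
    net:forward(x)
    net:backward(x, torch.randn(2))
    params:add(-0.01, gradParams)                    -- in-place update through the flat view
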
diff --git a/contrib/lua-torch/nn/ModuleCriterion.lua b/contrib/lua-torch/nn/ModuleCriterion.lua
new file mode 100644
index 000000000..bfc79ef55
--- /dev/null
+++ b/contrib/lua-torch/nn/ModuleCriterion.lua
@@ -0,0 +1,44 @@
+local ModuleCriterion, parent = torch.class("nn.ModuleCriterion", "nn.Criterion")
+
+function ModuleCriterion:__init(criterion, inputModule, targetModule, castTarget)
+ self.inputModule = inputModule
+ self.targetModule = targetModule
+ self.castTarget = (castTarget == nil) and true or castTarget
+ if self.inputModule then
+ local params = self.inputModule:parameters()
+ if params and #params > 0 then
+ print"Warning: nn.ModuleCriterion doesn't support parameter updates"
+ end
+ end
+ self.criterion = criterion
+end
+
+function ModuleCriterion:updateOutput(input, target)
+ if self.inputModule then
+ self.input = self.inputModule:forward(input)
+ end
+ if self.targetModule then
+ self.target = self.targetModule:forward(target)
+ end
+ self.output = self.criterion:forward(self.input or input, self.target or target)
+ return self.output
+end
+
+function ModuleCriterion:updateGradInput(input, target)
+ self.gradInput = self.criterion:backward(self.input or input, self.target or target)
+ if self.inputModule then
+ self.gradInput = self.inputModule:backward(input, self.gradInput)
+ end
+ return self.gradInput
+end
+
+function ModuleCriterion:type(type, typecache)
+ if self.inputModule then
+ self.inputModule:type(type, typecache)
+ end
+ if self.castTarget and self.targetModule then
+ self.targetModule:type(type, typecache)
+ end
+ self.criterion:type(type, typecache)
+ return parent.type(self, type, typecache)
+end
diff --git a/contrib/lua-torch/nn/Mul.lua b/contrib/lua-torch/nn/Mul.lua
new file mode 100644
index 000000000..efa1db656
--- /dev/null
+++ b/contrib/lua-torch/nn/Mul.lua
@@ -0,0 +1,38 @@
+local Mul, parent = torch.class('nn.Mul', 'nn.Module')
+
+function Mul:__init()
+ parent.__init(self)
+
+ self.weight = torch.Tensor(1)
+ self.gradWeight = torch.Tensor(1)
+
+ self:reset()
+end
+
+
+function Mul:reset(stdv)
+ if stdv then
+ stdv = stdv * math.sqrt(3)
+ else
+ stdv = 1./math.sqrt(self.weight:size(1))
+ end
+
+ self.weight:uniform(-stdv, stdv);
+end
+
+function Mul:updateOutput(input)
+ self.output:resizeAs(input):copy(input);
+ self.output:mul(self.weight[1]);
+ return self.output
+end
+
+function Mul:updateGradInput(input, gradOutput)
+ self.gradInput:resizeAs(input):zero()
+ self.gradInput:add(self.weight[1], gradOutput)
+ return self.gradInput
+end
+
+function Mul:accGradParameters(input, gradOutput, scale)
+ scale = scale or 1
+ self.gradWeight[1] = self.gradWeight[1] + scale*input:dot(gradOutput);
+end
diff --git a/contrib/lua-torch/nn/MulConstant.lua b/contrib/lua-torch/nn/MulConstant.lua
new file mode 100644
index 000000000..e8c473bee
--- /dev/null
+++ b/contrib/lua-torch/nn/MulConstant.lua
@@ -0,0 +1,41 @@
+local MulConstant, parent = torch.class('nn.MulConstant', 'nn.Module')
+
+function MulConstant:__init(constant_scalar,ip)
+ parent.__init(self)
+ assert(type(constant_scalar) == 'number', 'input is not scalar!')
+ self.constant_scalar = constant_scalar
+
+ -- default for inplace is false
+ self.inplace = ip or false
+ if (ip and type(ip) ~= 'boolean') then
+ error('in-place flag must be boolean')
+ end
+end
+
+function MulConstant:updateOutput(input)
+ if self.inplace then
+ input:mul(self.constant_scalar)
+ self.output:set(input)
+ else
+ self.output:resizeAs(input)
+ self.output:copy(input)
+ self.output:mul(self.constant_scalar)
+ end
+ return self.output
+end
+
+function MulConstant:updateGradInput(input, gradOutput)
+ if self.gradInput then
+ if self.inplace then
+ gradOutput:mul(self.constant_scalar)
+ self.gradInput:set(gradOutput)
+ -- restore previous input value
+ input:div(self.constant_scalar)
+ else
+ self.gradInput:resizeAs(gradOutput)
+ self.gradInput:copy(gradOutput)
+ self.gradInput:mul(self.constant_scalar)
+ end
+ return self.gradInput
+ end
+end
diff --git a/contrib/lua-torch/nn/MultiCriterion.lua b/contrib/lua-torch/nn/MultiCriterion.lua
new file mode 100644
index 000000000..959317711
--- /dev/null
+++ b/contrib/lua-torch/nn/MultiCriterion.lua
@@ -0,0 +1,40 @@
+local MultiCriterion, parent = torch.class('nn.MultiCriterion', 'nn.Criterion')
+
+function MultiCriterion:__init()
+ parent.__init(self)
+ self.criterions = {}
+ self.weights = torch.DoubleStorage()
+end
+
+function MultiCriterion:add(criterion, weight)
+ assert(criterion, 'no criterion provided')
+ weight = weight or 1
+ table.insert(self.criterions, criterion)
+ self.weights:resize(#self.criterions, true)
+ self.weights[#self.criterions] = weight
+ return self
+end
+
+function MultiCriterion:updateOutput(input, target)
+ self.output = 0
+ for i=1,#self.criterions do
+ self.output = self.output + self.weights[i]*self.criterions[i]:updateOutput(input, target)
+ end
+ return self.output
+end
+
+function MultiCriterion:updateGradInput(input, target)
+ self.gradInput = nn.utils.recursiveResizeAs(self.gradInput, input)
+ nn.utils.recursiveFill(self.gradInput, 0)
+ for i=1,#self.criterions do
+ nn.utils.recursiveAdd(self.gradInput, self.weights[i], self.criterions[i]:updateGradInput(input, target))
+ end
+ return self.gradInput
+end
+
+function MultiCriterion:type(type)
+ for i,criterion in ipairs(self.criterions) do
+ criterion:type(type)
+ end
+ return parent.type(self, type)
+end
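
MultiCriterion evaluates several criteria on the same (input, target) pair and returns their weighted sum; since add() returns self, calls chain. A minimal sketch:

    require 'nn'
    local crit = nn.MultiCriterion()
       :add(nn.MSECriterion(), 0.5)
       :add(nn.AbsCriterion())                       -- default weight 1
    local input, target = torch.randn(4), torch.randn(4)
    local loss = crit:forward(input, target)         -- 0.5*MSE + 1*Abs
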
diff --git a/contrib/lua-torch/nn/MultiLabelMarginCriterion.lua b/contrib/lua-torch/nn/MultiLabelMarginCriterion.lua
new file mode 100644
index 000000000..908b6133c
--- /dev/null
+++ b/contrib/lua-torch/nn/MultiLabelMarginCriterion.lua
@@ -0,0 +1,41 @@
+local MultiLabelMarginCriterion, parent = torch.class('nn.MultiLabelMarginCriterion', 'nn.Criterion')
+
+function MultiLabelMarginCriterion:__init()
+ parent.__init(self)
+ self.sizeAverage = true
+ self.isTarget = torch.Tensor()
+end
+
+function MultiLabelMarginCriterion:updateOutput(input, target)
+ if torch.typename(input):find('torch%.Cuda.*Tensor') then
+ target = torch.CudaLongTensor and target:cudaLong() or target
+ else
+ target = target:long()
+ end
+ self.output_tensor = self.output_tensor or input.new(1)
+ input.THNN.MultiLabelMarginCriterion_updateOutput(
+ input:cdata(),
+ target:cdata(),
+ self.output_tensor:cdata(),
+ self.isTarget:cdata(),
+ self.sizeAverage
+ )
+ self.output = self.output_tensor[1]
+ return self.output
+end
+
+function MultiLabelMarginCriterion:updateGradInput(input, target)
+ if torch.typename(input):find('torch%.Cuda.*Tensor') then
+ target = torch.CudaLongTensor and target:cudaLong() or target
+ else
+ target = target:long()
+ end
+ input.THNN.MultiLabelMarginCriterion_updateGradInput(
+ input:cdata(),
+ target:cdata(),
+ self.gradInput:cdata(),
+ self.isTarget:cdata(),
+ self.sizeAverage
+ )
+ return self.gradInput
+end
diff --git a/contrib/lua-torch/nn/MultiLabelSoftMarginCriterion.lua b/contrib/lua-torch/nn/MultiLabelSoftMarginCriterion.lua
new file mode 100644
index 000000000..9d471d449
--- /dev/null
+++ b/contrib/lua-torch/nn/MultiLabelSoftMarginCriterion.lua
@@ -0,0 +1,86 @@
+--[[
+-- A multi-label multiclass criterion based on the sigmoid:
+--
+-- the loss is:
+-- l(x,y) = - sum_i (y[i] * log(p[i]) + (1 - y[i]) * log(1 - p[i]))
+-- where p[i] = exp(x[i]) / (1 + exp(x[i]))
+--
+-- and with weights:
+-- l(x,y) = - sum_i weights[i] * (y[i] * log(p[i]) + (1 - y[i]) * log(1 - p[i]))
+--
+-- This uses the stable form of the loss and gradients.
+--]]
+
+
+local MultiLabelSoftMarginCriterion, parent = torch.class('nn.MultiLabelSoftMarginCriterion', 'nn.Criterion')
+
+
+function MultiLabelSoftMarginCriterion:__init(weights, sizeAverage)
+ parent.__init(self)
+ if sizeAverage ~= nil then
+ self.sizeAverage = sizeAverage
+ else
+ self.sizeAverage = true
+ end
+ if weights ~= nil then
+ assert(weights:dim() == 1, "weights input should be 1-D Tensor")
+ self.weights = weights
+ end
+ self.sigmoid = nn.Sigmoid()
+end
+
+function MultiLabelSoftMarginCriterion:updateOutput(input, target)
+ local weights = self.weights
+ if weights ~= nil and target:dim() ~= 1 then
+ weights = self.weights:view(1, target:size(2)):expandAs(target)
+ end
+
+ local x = input:view(input:nElement())
+ local t = target:view(target:nElement())
+
+ self.sigmoid:updateOutput(x)
+
+ self._buffer1 = self._buffer1 or input.new()
+ self._buffer2 = self._buffer2 or input.new()
+
+ self._buffer1:ge(x, 0) -- indicator
+
+ -- log(1 + exp(x - cmul(x, indicator):mul(2)))
+ self._buffer2:cmul(x, self._buffer1):mul(-2):add(x):exp():add(1):log()
+ -- cmul(x, t - indicator)
+ self._buffer1:mul(-1):add(t):cmul(x)
+ -- log(1 + exp(x - cmul(x, indicator):mul(2))) - cmul(x, t - indicator)
+ self._buffer2:add(-1, self._buffer1)
+
+ if weights ~= nil then
+ self._buffer2:cmul(weights)
+ end
+
+ self.output = self._buffer2:sum()
+
+ if self.sizeAverage then
+ self.output = self.output / input:nElement()
+ end
+
+ return self.output
+end
+
+function MultiLabelSoftMarginCriterion:updateGradInput(input, target)
+ local weights = self.weights
+ if weights ~= nil and target:dim() ~= 1 then
+ weights = self.weights:view(1, target:size(2)):expandAs(target)
+ end
+
+ self.gradInput:resizeAs(input):copy(self.sigmoid.output)
+ self.gradInput:add(-1, target)
+
+ if weights ~= nil then
+ self.gradInput:cmul(weights)
+ end
+
+ if self.sizeAverage then
+ self.gradInput:div(target:nElement())
+ end
+
+ return self.gradInput
+end
diff --git a/contrib/lua-torch/nn/MultiMarginCriterion.lua b/contrib/lua-torch/nn/MultiMarginCriterion.lua
new file mode 100644
index 000000000..e3122386a
--- /dev/null
+++ b/contrib/lua-torch/nn/MultiMarginCriterion.lua
@@ -0,0 +1,64 @@
+local THNN = require 'nn.THNN'
+local MultiMarginCriterion, parent = torch.class('nn.MultiMarginCriterion', 'nn.Criterion')
+
+function MultiMarginCriterion:__init(p, weights, margin)
+ assert(p == nil or p == 1 or p == 2, 'only p=1 and p=2 supported')
+ self.p = p or 1
+ self.margin = margin or 1.0
+ parent.__init(self)
+ self.sizeAverage = true
+ if weights then
+ assert(weights:dim() == 1, "weights input should be 1-D Tensor")
+ self.weights = weights
+ end
+end
+
+function MultiMarginCriterion:updateOutput(input, target)
+ -- backward compatibility
+ if not torch.isTensor(target) then
+ self.target_tensor = self.target_tensor or torch.LongTensor(1)
+ self.target_tensor[1] = target
+ target = self.target_tensor
+ end
+ if torch.typename(input):find('torch%.Cuda.*Tensor') then
+ target = torch.CudaLongTensor and target:cudaLong() or target
+ else
+ target = target:long()
+ end
+ self.p = self.p or 1
+ self.output_tensor = self.output_tensor or input.new(1)
+ input.THNN.MultiMarginCriterion_updateOutput(
+ input:cdata(),
+ target:cdata(),
+ self.output_tensor:cdata(),
+ self.sizeAverage,
+ self.p,
+ THNN.optionalTensor(self.weights),
+ self.margin
+ )
+ self.output = self.output_tensor[1]
+ return self.output
+end
+
+function MultiMarginCriterion:updateGradInput(input, target)
+ if not torch.isTensor(target) then
+ self.target_tensor = self.target_tensor or torch.LongTensor(1)
+ self.target_tensor[1] = target
+ target = self.target_tensor
+ end
+ if torch.typename(input):find('torch%.Cuda.*Tensor') then
+ target = torch.CudaLongTensor and target:cudaLong() or target
+ else
+ target = target:long()
+ end
+ input.THNN.MultiMarginCriterion_updateGradInput(
+ input:cdata(),
+ target:cdata(),
+ self.gradInput:cdata(),
+ self.sizeAverage,
+ self.p,
+ THNN.optionalTensor(self.weights),
+ self.margin
+ )
+ return self.gradInput
+end
diff --git a/contrib/lua-torch/nn/NaN.lua b/contrib/lua-torch/nn/NaN.lua
new file mode 100644
index 000000000..b80f6a04d
--- /dev/null
+++ b/contrib/lua-torch/nn/NaN.lua
@@ -0,0 +1,72 @@
+------------------------------------------------------------------------
+--[[ NaN ]]--
+-- Asserts that outputs and gradInputs do not contain NaNs.
+-- Useful for locating the source of NaN errors.
+------------------------------------------------------------------------
+local NaN, parent = torch.class("nn.NaN", "nn.Decorator")
+
+local idseq = 0
+function NaN.newId()
+ idseq = idseq + 1
+ return idseq
+end
+
+function NaN:__init(module, id)
+ parent.__init(self, module)
+ self.id = id or NaN.newId()
+end
+
+function NaN:recursiveIsNaN(tensor)
+ local isNaN = false
+ if torch.type(tensor) == 'table' then
+ for k,v in pairs(tensor) do
+ isNaN = self:recursiveIsNaN(v)
+ if isNaN then break end
+ end
+ else
+ local _ = require 'moses'
+ isNaN = _.isNaN(tensor:sum())
+ end
+ return isNaN
+end
+
+function NaN:updateOutput(input)
+ self.output = self.modules[1]:updateOutput(input)
+ if self:recursiveIsNaN(self.output) then
+ if self:recursiveIsNaN(input) then
+ error(string.format("NaN found in input of module :\n%s", self:__tostring__()))
+ elseif self:recursiveIsNaN(self:parameters()) then
+ error(string.format("NaN found in parameters of module :\n%s", self:__tostring__()))
+ end
+ error(string.format("NaN found in output of module :\n%s", self:__tostring__()))
+ end
+ return self.output
+end
+
+function NaN:updateGradInput(input, gradOutput)
+ self.gradInput = self.modules[1]:updateGradInput(input, gradOutput)
+ if self:recursiveIsNaN(self.gradInput) then
+ if self:recursiveIsNaN(gradOutput) then
+ error(string.format("NaN found in gradOutput of module :\n%s", self:__tostring__()))
+ end
+ error(string.format("NaN found in gradInput of module :\n%s", self:__tostring__()))
+ end
+ return self.gradInput
+end
+
+function NaN:accGradParameters(input, gradOutput, scale)
+ self.modules[1]:accGradParameters(input, gradOutput, scale)
+ local params, gradParams = self:parameters()
+ if self:recursiveIsNaN(gradParams) then
+ error(string.format("NaN found in gradParameters of module :\n%s", self:__tostring__()))
+ end
+end
+
+function NaN:__tostring__()
+ local selfstring = torch.type(self) .. '(' .. self.id .. ')'
+ if self.modules[1].__tostring__ then
+ return selfstring .. ' @ ' .. self.modules[1]:__tostring__()
+ else
+ return selfstring .. ' @ ' .. torch.type(self.modules[1])
+ end
+end
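
NaN decorates a module and raises as soon as a NaN appears, blaming the first culprit it can find (input, parameters, output, or gradients). A minimal sketch of how it trips (illustrative only):

    require 'nn'
    local layer = nn.NaN(nn.Linear(10, 10))          -- decorated module gets a unique id
    local ok, err = pcall(function()
       layer:forward(torch.Tensor(10):fill(0/0))     -- 0/0 produces NaN
    end)
    -- ok is false; err names the module whose input contained the NaN
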
diff --git a/contrib/lua-torch/nn/Narrow.lua b/contrib/lua-torch/nn/Narrow.lua
new file mode 100644
index 000000000..a6ebaa321
--- /dev/null
+++ b/contrib/lua-torch/nn/Narrow.lua
@@ -0,0 +1,45 @@
+local Narrow, parent = torch.class('nn.Narrow', 'nn.Module')
+
+function Narrow:__init(dimension,offset,length)
+ parent.__init(self)
+ self.dimension=dimension
+ self.index=offset
+ self.length=length or 1
+ if not dimension or not offset then
+ error('nn.Narrow(dimension, offset, length)')
+ end
+end
+
+function Narrow:updateOutput(input)
+ local dim = self.dimension < 0 and input:dim() + self.dimension + 1 or self.dimension
+ local length = self.length
+ if length < 0 then
+ length = input:size(dim) - self.index + self.length + 2
+ end
+ local index = self.index
+ if self.index < 0 then
+ index = 1
+ length = input:size(dim) - length
+ end
+ local output=input:narrow(dim, index, length)
+ self.output = self.output:typeAs(output)
+ self.output:resizeAs(output):copy(output)
+ return self.output
+end
+
+function Narrow:updateGradInput(input, gradOutput)
+ local dim = self.dimension < 0 and input:dim() + self.dimension + 1 or self.dimension
+ local length = self.length
+ if length < 0 then
+ length = input:size(dim) - self.index + self.length + 2
+ end
+ local index = self.index
+ if self.index < 0 then
+ index = 1
+ length = input:size(dim) - length
+ end
+ self.gradInput = self.gradInput:typeAs(input)
+ self.gradInput:resizeAs(input):zero()
+ self.gradInput:narrow(dim,index,length):copy(gradOutput)
+ return self.gradInput
+end
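
Narrow copies out a length-sized slice starting at offset along one dimension; the backward pass writes the gradient back into that slice of a zero tensor. A small worked sketch:

    require 'nn'
    local n = nn.Narrow(1, 2, 3)                     -- dimension 1, offset 2, length 3
    local x = torch.range(1, 5)
    local y = n:forward(x)                           -- {2, 3, 4}
    local gx = n:backward(x, torch.Tensor{1, 1, 1})  -- {0, 1, 1, 1, 0}
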
diff --git a/contrib/lua-torch/nn/NarrowTable.lua b/contrib/lua-torch/nn/NarrowTable.lua
new file mode 100644
index 000000000..17429f3b1
--- /dev/null
+++ b/contrib/lua-torch/nn/NarrowTable.lua
@@ -0,0 +1,43 @@
+local NarrowTable, parent = torch.class('nn.NarrowTable', 'nn.Module')
+
+function NarrowTable:__init(offset, length)
+ parent.__init(self)
+ self.offset = offset
+ self.length = length or 1
+ if not offset then
+ error('nn.NarrowTable(offset, length)')
+ end
+
+ self.output = {}
+ self.gradInput = {}
+end
+
+function NarrowTable:updateOutput(input)
+ for k,v in ipairs(self.output) do self.output[k] = nil end
+ for i=1,self.length do
+ self.output[i] = input[self.offset+i-1]
+ end
+ return self.output
+end
+
+function NarrowTable:updateGradInput(input, gradOutput)
+ for i=1,#gradOutput do
+ self.gradInput[self.offset+i-1] = gradOutput[i]
+ end
+ for i=1,#input do
+ if (i < self.offset) or (i >= self.offset + self.length) then
+ self.gradInput[i] = nn.utils.recursiveResizeAs(self.gradInput[i], input[i])
+ nn.utils.recursiveFill(self.gradInput[i], 0)
+ end
+ end
+ for i=#input+1,#self.gradInput do self.gradInput[i] = nil end
+ return self.gradInput
+end
+
+function NarrowTable:type(type, tensorCache)
+ self.output = {}
+ self.gradInput = {}
+ return parent.type(self, type, tensorCache)
+end
+
+NarrowTable.clearState = nn.Identity.clearState
diff --git a/contrib/lua-torch/nn/Normalize.lua b/contrib/lua-torch/nn/Normalize.lua
new file mode 100644
index 000000000..0937ebba9
--- /dev/null
+++ b/contrib/lua-torch/nn/Normalize.lua
@@ -0,0 +1,150 @@
+local Normalize, parent = torch.class('nn.Normalize', 'nn.Module')
+
+function Normalize:__init(p,eps)
+ parent.__init(self)
+ assert(p,'p-norm not provided')
+ assert(p > 0, p..'-norm not supported')
+ self.p = p
+ self.eps = eps or 1e-10
+end
+
+function Normalize:updateOutput(input)
+ assert(input:dim() <= 2, 'only 1d layer supported')
+ local input_size = input:size()
+ if input:dim() == 1 then
+ input = input:view(1,-1)
+ end
+
+ self._output = self._output or input.new()
+ self.norm = self.norm or input.new()
+ self.buffer = self.buffer or input.new()
+
+ self._output:resizeAs(input)
+
+ if self.p == math.huge then
+ -- specialization for the infinity norm
+ if not self._indices then
+ if torch.typename(self.output):find('torch%.Cuda.*Tensor') then
+ self._indices = torch.CudaLongTensor and torch.CudaLongTensor() or torch.CudaTensor()
+ else
+ self._indices = torch.LongTensor()
+ end
+ end
+
+ self.buffer:abs(input)
+ torch.max(self.norm, self._indices, self.buffer, 2)
+ self.norm:add(self.eps)
+ else
+ self.normp = self.normp or input.new()
+ if self.p % 2 ~= 0 then
+ self.buffer:abs(input):pow(self.p)
+ else
+ self.buffer:pow(input,self.p)
+ end
+ self.normp:sum(self.buffer,2):add(self.eps)
+ self.norm:pow(self.normp,1/self.p)
+ end
+ self._output:cdiv(input, self.norm:view(-1,1):expandAs(input))
+
+ self.output:view(self._output, input_size)
+ return self.output
+end
+
+function Normalize:updateGradInput(input, gradOutput)
+   assert(input:dim() <= 2, 'only 1D or 2D (batched) input supported')
+   assert(gradOutput:dim() <= 2, 'only 1D or 2D (batched) gradOutput supported')
+
+ local input_size = input:size()
+ if input:dim() == 1 then
+ input = input:view(1,-1)
+ end
+
+ local n = input:size(1) -- batch size
+ local d = input:size(2) -- dimensionality of vectors
+
+ self._gradInput = self._gradInput or input.new()
+ self.cross = self.cross or input.new()
+ -- compute diagonal term with gradOutput
+ self._gradInput:resize(n,d)
+ if self.p == math.huge then
+ -- specialization for the inf case
+ self._gradInput:cmul(self.norm:view(n,1,1):expand(n,d,1),gradOutput)
+ self.buffer:resizeAs(input):zero()
+ self.cross:resize(n,1)
+ self.cross:gather(input,2,self._indices)
+ self.cross:cdiv(self.norm)
+ self.buffer:scatter(2,self._indices,self.cross)
+ else
+ self._gradInput:cmul(self.normp:view(n,1):expand(n,d), gradOutput)
+ -- small optimizations for different p
+ -- buffer = input*|input|^(p-2)
+ if self.p % 2 ~= 0 then
+ -- for non-even p, need to add absolute value
+ if self.p < 2 then
+ -- add eps to avoid possible division by 0
+ self.buffer:abs(input):add(self.eps):pow(self.p-2):cmul(input)
+ else
+ self.buffer:abs(input):pow(self.p-2):cmul(input)
+ end
+ elseif self.p == 2 then
+ -- special case for p == 2, pow(x,0) = 1
+ self.buffer:copy(input)
+ else
+ -- p is even and > 2, pow(x,p) is always positive
+ self.buffer:pow(input,self.p-2):cmul(input)
+ end
+ end
+ -- compute cross term in two steps
+ self.cross:resize(n,1)
+
+ -- instead of having a huge temporary matrix (b1*b2),
+ -- do the computations as b1*(b2*gradOutput). This avoids redundant
+ -- computation and also a huge buffer of size n*d^2
+ self.buffer2 = self.buffer2 or input.new() -- nxd
+ self.buffer2:cmul(input, gradOutput)
+ self.cross:sum(self.buffer2, 2)
+
+ self.buffer:cmul(self.cross:expandAs(self.buffer))
+ self._gradInput:add(-1, self.buffer)
+
+ -- reuse cross buffer for normalization
+ if self.p == math.huge then
+ self.cross:cmul(self.norm,self.norm)
+ else
+ self.cross:cmul(self.normp,self.norm)
+ end
+ self._gradInput:cdiv(self.cross:expand(n,d))
+
+ self.gradInput:view(self._gradInput, input_size)
+ return self.gradInput
+end
+
+function Normalize:__tostring__()
+ local s
+ -- different prints if the norm is integer
+ if self.p % 1 == 0 then
+ s = '%s(%d)'
+ else
+ s = '%s(%f)'
+ end
+ return string.format(s,torch.type(self),self.p)
+end
+
+function Normalize:type(type, tensorCache)
+ self._indices = nil
+ parent.type(self, type, tensorCache)
+ return self
+end
+
+function Normalize:clearState()
+ nn.utils.clear(self, {
+ '_output',
+ '_indices',
+ '_gradInput',
+ 'buffer',
+ 'norm',
+ 'normp',
+ 'cross',
+ })
+ return parent.clearState(self)
+end
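
Normalize divides each input row by its Lp norm (stabilized by eps), with a dedicated code path for the infinity norm. A small worked sketch, not part of the upstream file:

    require 'nn'
    local l2 = nn.Normalize(2)
    local y = l2:forward(torch.Tensor{3, 4})
    -- {0.6, 0.8}: the input divided by its L2 norm of 5 (up to the 1e-10 eps)
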
diff --git a/contrib/lua-torch/nn/OneHot.lua b/contrib/lua-torch/nn/OneHot.lua
new file mode 100644
index 000000000..d1dc1b52d
--- /dev/null
+++ b/contrib/lua-torch/nn/OneHot.lua
@@ -0,0 +1,69 @@
+local OneHot, parent = torch.class('nn.OneHot', 'nn.Module')
+
+-- adapted from https://github.com/karpathy/char-rnn
+-- and https://github.com/hughperkins/char-lstm
+
+function OneHot:__init(outputSize)
+ parent.__init(self)
+ self.outputSize = outputSize
+end
+
+function OneHot:updateOutput(input)
+ local size
+ if type(input) == 'number' then
+ if self:type() == 'torch.CudaTensor' then
+ self._single = self._single or torch.CudaTensor():resize(1);
+ else
+ self._single = self._single or torch.LongTensor():resize(1);
+ end
+ self._single[1] = input
+ input = self._single;
+ size = {}
+ else
+ size = input:size():totable()
+ end
+ table.insert(size, self.outputSize)
+
+ self.output:resize(table.unpack(size)):zero()
+
+ size[#size] = 1
+ local input_ = input:view(table.unpack(size))
+
+ if torch.type(input) == 'torch.CudaTensor' or torch.type(input) == 'torch.ClTensor' then
+ self.output:scatter(self.output:dim(), input_, 1)
+ else
+ if torch.type(self.output) == 'torch.CudaTensor' then
+ -- input is not cuda, module is, cast input to cuda
+ self._input = self._input or torch.CudaTensor()
+ self._input:resize(input_:size()):copy(input_)
+ input_ = self._input
+ elseif torch.type(input) ~= 'torch.LongTensor' then
+         -- input is not long, module is not cuda, cast input to long
+ self._input = self._input or torch.LongTensor()
+ self._input:resize(input_:size()):copy(input_)
+ input_ = self._input
+ end
+ self.output:scatter(self.output:dim(), input_, 1)
+ end
+
+ return self.output
+end
+
+function OneHot:updateGradInput(input, gradOutput)
+ if type(input) == 'number' then
+ return 0
+ else
+ self.gradInput:resize(input:size()):zero()
+ return self.gradInput
+ end
+end
+
+function OneHot:clearState()
+ self._single = nil
+ self._input = nil
+end
+
+function OneHot:type(type, typecache)
+ self:clearState()
+ return parent.type(self, type, typecache)
+end
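
OneHot scatters integer class indices into one-hot rows of width outputSize, casting the index tensor as needed for CPU or CUDA. A minimal sketch:

    require 'nn'
    local oh = nn.OneHot(4)
    local y = oh:forward(torch.LongTensor{2, 4})
    -- y = {{0, 1, 0, 0},
    --      {0, 0, 0, 1}}
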
diff --git a/contrib/lua-torch/nn/PReLU.lua b/contrib/lua-torch/nn/PReLU.lua
new file mode 100644
index 000000000..2e58fba4e
--- /dev/null
+++ b/contrib/lua-torch/nn/PReLU.lua
@@ -0,0 +1,52 @@
+local PReLU, parent = torch.class('nn.PReLU','nn.Module')
+
+function PReLU:__init(nOutputPlane)
+ parent.__init(self)
+ -- if no argument provided, use shared model (weight is scalar)
+ self.nOutputPlane = nOutputPlane or 0
+ self.weight = torch.Tensor(nOutputPlane or 1):fill(0.25)
+ self.gradWeight = torch.Tensor(nOutputPlane or 1)
+end
+
+function PReLU:updateOutput(input)
+ input.THNN.PReLU_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ self.weight:cdata(),
+ self.nOutputPlane
+ )
+ return self.output
+end
+
+function PReLU:updateGradInput(input, gradOutput)
+ input.THNN.PReLU_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.weight:cdata(),
+ self.nOutputPlane
+ )
+ return self.gradInput
+end
+
+function PReLU:accGradParameters(input, gradOutput, scale)
+ self.gradWeightBuf = self.gradWeightBuf or input.new()
+ self.gradWeightBuf2 = self.gradWeightBuf2 or input.new()
+ input.THNN.PReLU_accGradParameters(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.weight:cdata(),
+ self.gradWeight:cdata(),
+ self.gradWeightBuf:cdata(),
+ self.gradWeightBuf2:cdata(),
+ self.nOutputPlane,
+ scale or 1
+ )
+ return self.gradWeight
+end
+
+function PReLU:clearState()
+ nn.utils.clear(self, 'gradWeightBuf', 'gradWeightBuf2')
+ return parent.clearState(self)
+end
diff --git a/contrib/lua-torch/nn/Padding.lua b/contrib/lua-torch/nn/Padding.lua
new file mode 100644
index 000000000..d5f7771d0
--- /dev/null
+++ b/contrib/lua-torch/nn/Padding.lua
@@ -0,0 +1,65 @@
+local Padding, parent = torch.class('nn.Padding', 'nn.Module')
+
+-- Padding adds [pad] elements of [value] along dimension [dim], starting at index [index] in that dimension; if pad < 0, index counts from the left, and if pad > 0, index counts from the right.
+-- index = 1 pads before index 1; index = 2 pads starting before index 2 and after index 1 in dimension [dim].
+function Padding:__init(dim, pad, nInputDim, value, index)
+ self.value = value or 0
+ self.index = index or 1
+ self.dim = dim
+ self.pad = pad
+ self.nInputDim = nInputDim
+ self.outputSize = torch.LongStorage()
+ parent.__init(self)
+end
+
+function Padding:updateOutput(input)
+ self.outputSize:resize(input:dim())
+ self.outputSize:copy(input:size())
+ local dim = self.dim
+ if self.nInputDim and input:dim() ~= self.nInputDim then
+ dim = dim + 1
+ end
+ self.outputSize[dim] = self.outputSize[dim] + math.abs(self.pad)
+ self.output:resize(self.outputSize)
+ self.output:fill(self.value)
+ local index = self.index
+ local pad = self.pad
+ if pad > 0 then
+ index = input:size(dim) - index + 2
+ else
+ pad = -pad
+ end
+ if index == 1 then
+ self.output:narrow(dim, 1 + pad, input:size(dim)):copy(input)
+ elseif index == input:size(dim) + 1 then
+ self.output:narrow(dim, 1, input:size(dim)):copy(input)
+ else
+ self.output:narrow(dim, 1, index - 1):copy(input:narrow(dim, 1, index - 1))
+ self.output:narrow(dim, index + pad, input:size(dim) - (index - 1)):copy(input:narrow(dim, index, input:size(dim) - (index - 1)))
+ end
+ return self.output
+end
+
+function Padding:updateGradInput(input, gradOutput)
+ self.gradInput:resizeAs(input)
+ local dim = self.dim
+ if self.nInputDim and input:dim() ~= self.nInputDim then
+ dim = dim + 1
+ end
+ local index = self.index
+ local pad = self.pad
+ if pad > 0 then
+ index = input:size(dim) - index + 2
+ else
+ pad = -pad
+ end
+ if index == 1 then
+ self.gradInput:copy(gradOutput:narrow(dim, 1 + pad, input:size(dim)))
+ elseif index == input:size(dim) + 1 then
+ self.gradInput:copy(gradOutput:narrow(dim, 1, input:size(dim)))
+ else
+ self.gradInput:narrow(dim, 1, index - 1):copy(gradOutput:narrow(dim, 1, index - 1))
+ self.gradInput:narrow(dim, index, input:size(dim) - (index - 1)):copy(gradOutput:narrow(dim, index + pad, input:size(dim) - (index - 1)))
+ end
+ return self.gradInput
+end
diff --git a/contrib/lua-torch/nn/PairwiseDistance.lua b/contrib/lua-torch/nn/PairwiseDistance.lua
new file mode 100644
index 000000000..99a502c16
--- /dev/null
+++ b/contrib/lua-torch/nn/PairwiseDistance.lua
@@ -0,0 +1,91 @@
+local PairwiseDistance, parent = torch.class('nn.PairwiseDistance', 'nn.Module')
+
+function PairwiseDistance:__init(p)
+ parent.__init(self)
+
+ -- state
+ self.gradInput = {}
+ self.diff = torch.Tensor()
+   self.norm = p or 2 -- default to Euclidean (L2) distance
+end
+
+function PairwiseDistance:updateOutput(input)
+ self.output:resize(1)
+ if input[1]:dim() == 1 then
+ self.output:resize(1)
+ self.output[1]=input[1]:dist(input[2],self.norm)
+ elseif input[1]:dim() == 2 then
+ self.diff = self.diff or input[1].new()
+ self.diff:resizeAs(input[1])
+
+ local diff = self.diff:zero()
+ diff:add(input[1], -1, input[2])
+ diff:abs()
+
+ self.output:resize(input[1]:size(1))
+ self.output:zero()
+ self.output:add(diff:pow(self.norm):sum(2))
+ self.output:pow(1./self.norm)
+ else
+ error('input must be vector or matrix')
+ end
+
+ return self.output
+end
+
+local function mathsign(x)
+ if x==0 then return 2*torch.random(2)-3; end
+ if x>0 then return 1; else return -1; end
+end
+
+function PairwiseDistance:updateGradInput(input, gradOutput)
+ if input[1]:dim() > 2 then
+ error('input must be vector or matrix')
+ end
+
+ self.gradInput[1] = (self.gradInput[1] or input[1].new()):resize(input[1]:size())
+ self.gradInput[2] = (self.gradInput[2] or input[2].new()):resize(input[2]:size())
+ self.gradInput[1]:copy(input[1])
+ self.gradInput[1]:add(-1, input[2])
+
+ if self.norm==1 then
+ self.gradInput[1]:apply(mathsign)
+ else
+ -- Note: derivative of p-norm:
+ -- d/dx_k(||x||_p) = (x_k * abs(x_k)^(p-2)) / (||x||_p)^(p-1)
+ if (self.norm > 2) then
+ self.gradInput[1]:cmul(self.gradInput[1]:clone():abs():pow(self.norm-2))
+ end
+
+ if (input[1]:dim() > 1) then
+ self.outExpand = self.outExpand or self.output.new()
+ self.outExpand:resize(self.output:size(1), 1)
+ self.outExpand:copy(self.output)
+ self.outExpand:add(1.0e-6) -- Prevent divide by zero errors
+ self.outExpand:pow(-(self.norm-1))
+ self.gradInput[1]:cmul(self.outExpand:expand(self.gradInput[1]:size(1),
+ self.gradInput[1]:size(2)))
+ else
+ self.gradInput[1]:mul(math.pow(self.output[1] + 1e-6, -(self.norm-1)))
+ end
+ end
+ if input[1]:dim() == 1 then
+ self.gradInput[1]:mul(gradOutput[1])
+ else
+ self.grad = self.grad or gradOutput.new()
+ self.ones = self.ones or gradOutput.new()
+
+ self.grad:resizeAs(input[1]):zero()
+ self.ones:resize(input[1]:size(2)):fill(1)
+
+ self.grad:addr(gradOutput, self.ones)
+ self.gradInput[1]:cmul(self.grad)
+ end
+ self.gradInput[2]:zero():add(-1, self.gradInput[1])
+ return self.gradInput
+end
+
+function PairwiseDistance:clearState()
+ nn.utils.clear(self, 'diff', 'outExpand', 'grad', 'ones')
+ return parent.clearState(self)
+end
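+
+--[[ Usage sketch (illustrative): the module takes a table of two tensors and
+returns the pairwise p-norm distance between them.
+
+   local d = nn.PairwiseDistance(2)                       -- Euclidean distance
+   local out = d:forward{torch.randn(5), torch.randn(5)}  -- 1-element tensor
+]]--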
diff --git a/contrib/lua-torch/nn/Parallel.lua b/contrib/lua-torch/nn/Parallel.lua
new file mode 100644
index 000000000..58cb9748e
--- /dev/null
+++ b/contrib/lua-torch/nn/Parallel.lua
@@ -0,0 +1,116 @@
+local Parallel, parent = torch.class('nn.Parallel', 'nn.Container')
+
+function Parallel:__init(inputDimension,outputDimension)
+ parent.__init(self)
+ self.modules = {}
+ self.inputDimension = inputDimension
+ self.outputDimension = outputDimension
+end
+
+function Parallel:updateOutput(input)
+ local nModule=input:size(self.inputDimension)
+ local outputs = {}
+ self.totalOutputSize = self.totalOutputSize or torch.LongStorage()
+ local totalOutputSize = self.totalOutputSize
+
+ for i=1,nModule do
+ local currentInput = input:select(self.inputDimension,i)
+ local currentOutput = self:rethrowErrors(self.modules[i], i, 'updateOutput', currentInput)
+ table.insert(outputs, currentOutput)
+ local outputSize = currentOutput:size(self.outputDimension)
+
+ if i == 1 then
+ totalOutputSize:resize(currentOutput:dim()):copy(currentOutput:size())
+ else
+ totalOutputSize[self.outputDimension] = totalOutputSize[self.outputDimension] + outputSize
+ end
+
+ end
+ self.output:resize(totalOutputSize)
+
+ local offset = 1
+ for i=1,nModule do
+ local currentOutput = outputs[i]
+ local outputSize = currentOutput:size(self.outputDimension)
+ self.output:narrow(self.outputDimension, offset, outputSize):copy(currentOutput)
+ offset = offset + currentOutput:size(self.outputDimension)
+ end
+ return self.output
+end
+
+function Parallel:updateGradInput(input, gradOutput)
+ local nModule=input:size(self.inputDimension)
+ self.gradInput:resizeAs(input)
+
+ local offset = 1
+ for i=1,nModule do
+ local module=self.modules[i]
+ local currentInput = input:select(self.inputDimension,i)
+ local currentOutput = module.output
+ local outputSize = currentOutput:size(self.outputDimension)
+ local currentGradOutput = gradOutput:narrow(self.outputDimension, offset, outputSize)
+
+ local currentGradInput = self:rethrowErrors(module, i, 'updateGradInput', currentInput, currentGradOutput)
+
+ self.gradInput:select(self.inputDimension,i):copy(currentGradInput)
+ offset = offset + outputSize
+ end
+ return self.gradInput
+end
+
+function Parallel:accGradParameters(input, gradOutput, scale)
+ local nModule=input:size(self.inputDimension)
+
+ local offset = 1
+ for i=1,nModule do
+ local module = self.modules[i]
+ local currentOutput = module.output
+ local outputSize = currentOutput:size(self.outputDimension)
+
+ self:rethrowErrors(module, i, 'accGradParameters',
+ input:select(self.inputDimension,i),
+ gradOutput:narrow(self.outputDimension, offset,outputSize),
+ scale)
+
+ offset = offset + outputSize
+ end
+end
+
+function Parallel:accUpdateGradParameters(input, gradOutput, lr)
+ local nModule=input:size(self.inputDimension)
+
+ local offset = 1
+ for i=1,nModule do
+      local module = self.modules[i]
+ local currentOutput = module.output
+ self:rethrowErrors(module, i, 'accUpdateGradParameters',
+ input:select(self.inputDimension,i),
+ gradOutput:narrow(self.outputDimension, offset,
+ currentOutput:size(self.outputDimension)),
+ lr)
+
+ offset = offset + currentOutput:size(self.outputDimension)
+ end
+end
+
+function Parallel:__tostring__()
+ local tab = ' '
+ local line = '\n'
+ local next = ' |`-> '
+ local lastNext = ' `-> '
+ local ext = ' | '
+ local extlast = ' '
+ local last = ' ... -> '
+ local str = torch.type(self)
+ str = str .. ' {' .. line .. tab .. 'input'
+ for i=1,#self.modules do
+ if i == #self.modules then
+ str = str .. line .. tab .. lastNext .. '(' .. i .. '): ' .. tostring(self.modules[i]):gsub(line, line .. tab .. extlast)
+ else
+ str = str .. line .. tab .. next .. '(' .. i .. '): ' .. tostring(self.modules[i]):gsub(line, line .. tab .. ext)
+ end
+ end
+ str = str .. line .. tab .. last .. 'output'
+ str = str .. line .. '}'
+ return str
+end
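+
+--[[ Usage sketch (illustrative): each slice of the input along inputDimension is
+fed to one child module, and the child outputs are concatenated along
+outputDimension.
+
+   local p = nn.Parallel(1, 1)    -- split input on dim 1, join outputs on dim 1
+   p:add(nn.Linear(10, 3))
+   p:add(nn.Linear(10, 2))
+   p:forward(torch.randn(2, 10))  -- returns a 5-element tensor
+]]--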
diff --git a/contrib/lua-torch/nn/ParallelCriterion.lua b/contrib/lua-torch/nn/ParallelCriterion.lua
new file mode 100644
index 000000000..45607d5c3
--- /dev/null
+++ b/contrib/lua-torch/nn/ParallelCriterion.lua
@@ -0,0 +1,41 @@
+local ParallelCriterion, parent = torch.class('nn.ParallelCriterion', 'nn.Criterion')
+
+function ParallelCriterion:__init(repeatTarget)
+ parent.__init(self)
+ self.criterions = {}
+ self.weights = {}
+ self.gradInput = {}
+ self.repeatTarget = repeatTarget
+end
+
+function ParallelCriterion:add(criterion, weight)
+ assert(criterion, 'no criterion provided')
+ weight = weight or 1
+ table.insert(self.criterions, criterion)
+ table.insert(self.weights, weight)
+ return self
+end
+
+function ParallelCriterion:updateOutput(input, target)
+ self.output = 0
+ for i,criterion in ipairs(self.criterions) do
+ local target = self.repeatTarget and target or target[i]
+ self.output = self.output + self.weights[i]*criterion:updateOutput(input[i],target)
+ end
+ return self.output
+end
+
+function ParallelCriterion:updateGradInput(input, target)
+ self.gradInput = nn.utils.recursiveResizeAs(self.gradInput, input)
+ nn.utils.recursiveFill(self.gradInput, 0)
+ for i,criterion in ipairs(self.criterions) do
+ local target = self.repeatTarget and target or target[i]
+ nn.utils.recursiveAdd(self.gradInput[i], self.weights[i], criterion:updateGradInput(input[i], target))
+ end
+ return self.gradInput
+end
+
+function ParallelCriterion:type(type, tensorCache)
+ self.gradInput = {}
+ return parent.type(self, type, tensorCache)
+end
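+
+--[[ Usage sketch (illustrative; the x/t pairs are assumed defined elsewhere):
+each criterion is applied to the corresponding input and target (unless
+repeatTarget is set), and the weighted losses are summed.
+
+   local pc = nn.ParallelCriterion()
+   pc:add(nn.MSECriterion()):add(nn.ClassNLLCriterion(), 0.5)
+   local loss = pc:forward({x1, x2}, {t1, t2}) -- mse(x1,t1) + 0.5*nll(x2,t2)
+]]--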
diff --git a/contrib/lua-torch/nn/ParallelTable.lua b/contrib/lua-torch/nn/ParallelTable.lua
new file mode 100644
index 000000000..2fe0899dd
--- /dev/null
+++ b/contrib/lua-torch/nn/ParallelTable.lua
@@ -0,0 +1,58 @@
+local ParallelTable, parent = torch.class('nn.ParallelTable', 'nn.Container')
+
+function ParallelTable:__init()
+ parent.__init(self)
+ self.modules = {}
+ self.output = {}
+ self.gradInput = {}
+end
+
+function ParallelTable:updateOutput(input)
+ for i=1,#self.modules do
+ self.output[i] = self:rethrowErrors(self.modules[i], i, 'updateOutput', input[i])
+ end
+ return self.output
+end
+
+function ParallelTable:updateGradInput(input, gradOutput)
+ for i,module in ipairs(self.modules) do
+ self.gradInput[i] = self:rethrowErrors(module, i, 'updateGradInput', input[i], gradOutput[i])
+ end
+ return self.gradInput
+end
+
+function ParallelTable:accGradParameters(input, gradOutput, scale)
+ scale = scale or 1
+ for i,module in ipairs(self.modules) do
+ self:rethrowErrors(module, i, 'accGradParameters', input[i], gradOutput[i], scale)
+ end
+end
+
+function ParallelTable:accUpdateGradParameters(input, gradOutput, lr)
+ lr = lr or 1
+ for i,module in ipairs(self.modules) do
+ self:rethrowErrors(module, i, 'accUpdateGradParameters', input[i], gradOutput[i], lr)
+ end
+end
+
+function ParallelTable:__tostring__()
+ local tab = ' '
+ local line = '\n'
+ local next = ' |`-> '
+ local lastNext = ' `-> '
+ local ext = ' | '
+ local extlast = ' '
+ local last = ' ... -> '
+ local str = torch.type(self)
+ str = str .. ' {' .. line .. tab .. 'input'
+ for i=1,#self.modules do
+ if i == #self.modules then
+ str = str .. line .. tab .. lastNext .. '(' .. i .. '): ' .. tostring(self.modules[i]):gsub(line, line .. tab .. extlast)
+ else
+ str = str .. line .. tab .. next .. '(' .. i .. '): ' .. tostring(self.modules[i]):gsub(line, line .. tab .. ext)
+ end
+ end
+ str = str .. line .. tab .. last .. 'output'
+ str = str .. line .. '}'
+ return str
+end
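+
+--[[ Usage sketch (illustrative): the i-th module is applied to the i-th table
+entry, producing a table of outputs.
+
+   local pt = nn.ParallelTable()
+   pt:add(nn.Linear(10, 2)):add(nn.Linear(5, 3))
+   local out = pt:forward{torch.randn(10), torch.randn(5)} -- {2-vector, 3-vector}
+]]--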
diff --git a/contrib/lua-torch/nn/PartialLinear.lua b/contrib/lua-torch/nn/PartialLinear.lua
new file mode 100644
index 000000000..6e92cfc08
--- /dev/null
+++ b/contrib/lua-torch/nn/PartialLinear.lua
@@ -0,0 +1,114 @@
+local PartialLinear, Module = torch.class('nn.PartialLinear', 'nn.Module')
+
+--[[
+
+PartialLinear is a Linear layer that allows the user to set a collection of
+column indices. When the column indices are set, the layer will behave like a
+Linear layer that only has those columns. Meanwhile, all parameters are
+preserved, so resetting the PartialLinear layer will result in a module that
+behaves just like a regular Linear layer.
+
+This module is useful, for instance, when you want to do forward-backward on
+only a subset of a Linear layer during training but use the full Linear layer
+at test time.
+
+]]--
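+
+--[[ Usage sketch (illustrative):
+
+   local m = nn.PartialLinear(5, 100)
+   m:setPartition(torch.Tensor{2, 5, 7})     -- act as output columns {2, 5, 7} only
+   local out = m:forward(torch.randn(4, 5))  -- 4 x 3 output
+   m:resetPartition()                        -- back to the full 4 x 100 output
+]]--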
+
+function PartialLinear:__init(inputsize, outputsize, bias)
+ local bias = ((bias == nil) and true) or bias
+ Module.__init(self)
+
+ -- define the layer as a small network:
+ local pt = nn.ParallelTable()
+ pt:add(nn.Identity()):add(nn.LookupTable(outputsize, inputsize))
+ self.network = nn.Sequential():add(pt):add(nn.MM(false, true))
+ if bias then
+ self.bias = torch.Tensor(1, outputsize):zero()
+ self.gradBias = torch.Tensor(1, outputsize):zero()
+ end
+
+ -- set partition:
+ self.inputsize = inputsize
+ self.outputsize = outputsize
+ self.allcolumns = torch.range(1, self.outputsize)
+ self:resetPartition()
+end
+
+function PartialLinear:setPartition(indices)
+ self.partition = indices:type(self.allcolumns:type())
+end
+
+function PartialLinear:resetPartition()
+ self.partition = self.allcolumns
+end
+
+function PartialLinear:parameters()
+ return {self.network:get(1):get(2).weight, self.bias},
+ {self.network:get(1):get(2).gradWeight, self.gradBias}
+end -- should return only the relevant partition?
+
+function PartialLinear:updateOutput(input)
+ self.output:set(self.network:forward{input, self.partition})
+ if self.bias then
+ self.output:add(
+ self.bias:index(2, self.partition:long()):expandAs(self.output)
+ )
+ self.addBuffer = self.addBuffer or input.new()
+ if self.addBuffer:nElement() ~= input:size(1) then
+ self.addBuffer:resize(input:size(1)):fill(1)
+ end
+ end
+ return self.output
+end
+
+function PartialLinear:updateGradInput(input, gradOutput)
+ if self.gradInput then
+ self.network:updateGradInput({input, self.partition}, gradOutput)
+ self.gradInput:set(self.network.gradInput[1])
+ end
+ return self.gradInput
+end
+
+function PartialLinear:accGradParameters(input, gradOutput, scale)
+ local scale = scale or 1
+ self.network:accGradParameters({input, self.partition}, gradOutput, scale)
+ if self.bias then
+ self.buffer = self.buffer or input.new()
+ self.buffer:resize(gradOutput:size(2))
+ self.buffer:mv(gradOutput:t(), self.addBuffer):mul(scale)
+ self.gradBias:indexAdd(
+ 2, self.partition:long(), self.buffer:view(1, self.buffer:nElement())
+ )
+ end
+end
+
+function PartialLinear:accUpdateGradParameters(input, gradOutput, lr)
+ local gradWeight = self.network:get(1):get(2).gradWeight
+ local gradBias = self.gradBias
+ self.network:get(1):get(2).gradWeight = self.network:get(1):get(2).weight
+ self.gradBias = self.bias
+ self:accGradParameters(input, gradOutput, -lr)
+ self.network:get(1):get(2).gradWeight = gradWeight
+ self.gradBias = gradBias
+end
+
+function PartialLinear:zeroGradParameters()
+ self.network:zeroGradParameters()
+ self.gradBias:zero()
+end
+
+function PartialLinear:updateParameters(learningRate)
+ self.network:updateParameters(learningRate)
+ self.bias:add(-learningRate, self.gradBias)
+end
+
+function PartialLinear:sharedAccUpdateGradParameters(input, gradOutput, lr)
+ -- we do not need to accumulate parameters when sharing:
+ self:defaultAccUpdateGradParameters(input, gradOutput, lr)
+end
+
+function PartialLinear:__tostring__()
+ return torch.type(self) ..
+ string.format('(%d -> %d)', self.inputsize, self.outputsize) ..
+ (self.bias == nil and ' without bias' or '')
+end
diff --git a/contrib/lua-torch/nn/PixelShuffle.lua b/contrib/lua-torch/nn/PixelShuffle.lua
new file mode 100644
index 000000000..dd58ed948
--- /dev/null
+++ b/contrib/lua-torch/nn/PixelShuffle.lua
@@ -0,0 +1,111 @@
+local PixelShuffle, parent = torch.class("nn.PixelShuffle", "nn.Module")
+
+-- Shuffles pixels after upscaling with an ESPCN model
+-- Converts a [batch x channel*r^2 x m x p] tensor to [batch x channel x r*m x r*p]
+-- tensor, where r is the upscaling factor.
+-- @param upscaleFactor - the upscaling factor to use
+function PixelShuffle:__init(upscaleFactor)
+ parent.__init(self)
+ self.upscaleFactor = upscaleFactor
+ self.upscaleFactorSquared = self.upscaleFactor * self.upscaleFactor
+end
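+
+--[[ Usage sketch (illustrative):
+
+   local ps = nn.PixelShuffle(2)
+   ps:forward(torch.randn(1, 12, 4, 4)) -- returns a 1 x 3 x 8 x 8 tensor
+]]--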
+
+-- Computes the forward pass of the layer i.e. Converts a
+-- [batch x channel*r^2 x m x p] tensor to [batch x channel x r*m x r*p] tensor.
+-- @param input - the input tensor to be shuffled of size [b x c*r^2 x m x p]
+-- @return output - the shuffled tensor of size [b x c x r*m x r*p]
+function PixelShuffle:updateOutput(input)
+ self._intermediateShape = self._intermediateShape or torch.LongStorage(6)
+   self._outShape = self._outShape or torch.LongStorage()
+ self._shuffleOut = self._shuffleOut or input.new()
+
+ local batched = false
+ local batchSize = 1
+ local inputStartIdx = 1
+ local outShapeIdx = 1
+ if input:nDimension() == 4 then
+ batched = true
+ batchSize = input:size(1)
+ inputStartIdx = 2
+ outShapeIdx = 2
+ self._outShape:resize(4)
+ self._outShape[1] = batchSize
+ else
+ self._outShape:resize(3)
+ end
+
+   -- input has c*r^2 channels over an (h/r) x (w/r) grid; view it as [batch x c x r x r x h/r x w/r] before permuting
+ local channels = input:size(inputStartIdx) / self.upscaleFactorSquared
+ local inHeight = input:size(inputStartIdx + 1)
+ local inWidth = input:size(inputStartIdx + 2)
+
+ self._intermediateShape[1] = batchSize
+ self._intermediateShape[2] = channels
+ self._intermediateShape[3] = self.upscaleFactor
+ self._intermediateShape[4] = self.upscaleFactor
+ self._intermediateShape[5] = inHeight
+ self._intermediateShape[6] = inWidth
+
+ self._outShape[outShapeIdx] = channels
+ self._outShape[outShapeIdx + 1] = inHeight * self.upscaleFactor
+ self._outShape[outShapeIdx + 2] = inWidth * self.upscaleFactor
+
+ local inputView = torch.view(input, self._intermediateShape)
+
+ self._shuffleOut:resize(inputView:size(1), inputView:size(2), inputView:size(5),
+ inputView:size(3), inputView:size(6), inputView:size(4))
+ self._shuffleOut:copy(inputView:permute(1, 2, 5, 3, 6, 4))
+
+ self.output = torch.view(self._shuffleOut, self._outShape)
+
+ return self.output
+end
+
+-- Computes the backward pass of the layer, given the gradient w.r.t. the output
+-- this function computes the gradient w.r.t. the input.
+-- @param input - the input tensor of shape [b x c*r^2 x m x p]
+-- @param gradOutput - the tensor with the gradients w.r.t. output of shape [b x c x r*m x r*p]
+-- @return gradInput - a tensor of the same shape as input, representing the gradient w.r.t. input.
+function PixelShuffle:updateGradInput(input, gradOutput)
+ self._intermediateShape = self._intermediateShape or torch.LongStorage(6)
+ self._shuffleIn = self._shuffleIn or input.new()
+
+ local batchSize = 1
+ local inputStartIdx = 1
+ if input:nDimension() == 4 then
+ batchSize = input:size(1)
+ inputStartIdx = 2
+ end
+
+ local channels = input:size(inputStartIdx) / self.upscaleFactorSquared
+ local height = input:size(inputStartIdx + 1)
+ local width = input:size(inputStartIdx + 2)
+
+ self._intermediateShape[1] = batchSize
+ self._intermediateShape[2] = channels
+ self._intermediateShape[3] = height
+ self._intermediateShape[4] = self.upscaleFactor
+ self._intermediateShape[5] = width
+ self._intermediateShape[6] = self.upscaleFactor
+
+ local gradOutputView = torch.view(gradOutput, self._intermediateShape)
+
+ self._shuffleIn:resize(gradOutputView:size(1), gradOutputView:size(2), gradOutputView:size(4),
+ gradOutputView:size(6), gradOutputView:size(3), gradOutputView:size(5))
+ self._shuffleIn:copy(gradOutputView:permute(1, 2, 4, 6, 3, 5))
+
+ self.gradInput = torch.view(self._shuffleIn, input:size())
+
+ return self.gradInput
+end
+
+
+function PixelShuffle:clearState()
+ nn.utils.clear(self, {
+ "_intermediateShape",
+ "_outShape",
+ "_shuffleIn",
+ "_shuffleOut",
+ })
+ return parent.clearState(self)
+end
diff --git a/contrib/lua-torch/nn/Power.lua b/contrib/lua-torch/nn/Power.lua
new file mode 100644
index 000000000..771183c48
--- /dev/null
+++ b/contrib/lua-torch/nn/Power.lua
@@ -0,0 +1,22 @@
+local Power, parent = torch.class('nn.Power','nn.Module')
+
+function Power:__init(p)
+ parent.__init(self)
+ self.pow = p
+ if not p then
+ error('nn.Power(power)')
+ end
+end
+
+function Power:updateOutput(input)
+ self.output:resizeAs(input):copy(input)
+ self.output:pow(self.pow)
+ return self.output
+end
+
+function Power:updateGradInput(input, gradOutput)
+ self.gradInput:resizeAs(input):copy(input)
+ self.gradInput:pow(self.pow - 1)
+ self.gradInput:cmul(gradOutput):mul(self.pow)
+ return self.gradInput
+end
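+
+--[[ Usage sketch (illustrative): the gradient above implements
+d/dx x^p = p * x^(p-1), chained with gradOutput.
+
+   local sq = nn.Power(2)
+   sq:forward(torch.Tensor{1, 2, 3}) -- {1, 4, 9}
+]]--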
diff --git a/contrib/lua-torch/nn/PrintSize.lua b/contrib/lua-torch/nn/PrintSize.lua
new file mode 100644
index 000000000..d8dc91bff
--- /dev/null
+++ b/contrib/lua-torch/nn/PrintSize.lua
@@ -0,0 +1,36 @@
+local PrintSize, parent = torch.class('nn.PrintSize', 'nn.Module')
+
+function PrintSize:__init(prefix)
+ parent.__init(self)
+ self.prefix = prefix or "PrintSize"
+end
+
+function PrintSize:updateOutput(input)
+ self.output = input
+ local size
+ if torch.type(input) == 'table' then
+ size = input
+ elseif torch.type(input) == 'nil' then
+ size = 'missing size'
+ else
+ size = input:size()
+ end
+ print(self.prefix..":input\n", size)
+ return self.output
+end
+
+
+function PrintSize:updateGradInput(input, gradOutput)
+ local size
+ if torch.type(gradOutput) == 'table' then
+ size = gradOutput
+ elseif torch.type(gradOutput) == 'nil' then
+ size = 'missing size'
+ else
+ size = gradOutput:size()
+ end
+ print(self.prefix..":gradOutput\n", size)
+ self.gradInput = gradOutput
+ return self.gradInput
+end
+
diff --git a/contrib/lua-torch/nn/Profile.lua b/contrib/lua-torch/nn/Profile.lua
new file mode 100644
index 000000000..36cd909cd
--- /dev/null
+++ b/contrib/lua-torch/nn/Profile.lua
@@ -0,0 +1,55 @@
+local ProfileModule, parent = torch.class("nn.Profile", "nn.Decorator")
+
+function ProfileModule:__init(module, print_interval, name)
+ parent.__init(self, module)
+ self.print_interval = print_interval or 100
+ self.name = name or torch.type(module)
+ self.module = module
+ self.numFwds = 0
+ self.numBwds = 0
+ self.summedFwdTime = 0
+ self.summedBwdTime = 0
+ self.timer = torch.Timer()
+end
+
+function ProfileModule:updateOutput(input)
+ self.timer:reset()
+ self.output = self.module:updateOutput(input)
+ self.summedFwdTime = self.summedFwdTime + self.timer:time().real
+ self.numFwds = self.numFwds + 1
+ if self.numFwds % self.print_interval == 0 then
+ print (string.format('%s took %.3f seconds for %d forward passes',
+ self.name, self.summedFwdTime, self.print_interval))
+ self.numFwds = 0
+ self.summedFwdTime = 0
+ end
+ return self.output
+end
+
+function ProfileModule:updateGradInput(input, gradOutput)
+ self.timer:reset()
+ self.gradInput = self.module:updateGradInput(input, gradOutput)
+ self.summedBwdTime = self.summedBwdTime + self.timer:time().real
+ self.numBwds = self.numBwds + 1
+ if self.numBwds % self.print_interval == 0 then
+ print (string.format('%s took %.3f seconds for %d backward passes',
+ self.name, self.summedBwdTime, self.print_interval))
+ self.numBwds = 0
+ self.summedBwdTime = 0
+ end
+ return self.gradInput
+end
+
+local function makeTorchTimerSerializable()
+ -- The Timer object part of this class needs to be serializable
+ -- so that the layer can be saved, cloned, etc. We add a dummy
+ -- serialization of torch.Timer that just creates a new instance at read
+ local timerMetatable = getmetatable(torch.Timer())
+ timerMetatable['__factory'] = torch.Timer
+ timerMetatable['write'] = function(object, file) end
+ timerMetatable['read'] = function(object, file, versionNumber)
+ return object
+ end
+end
+
+makeTorchTimerSerializable()
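+
+--[[ Usage sketch (illustrative): wrap any module to print its cumulative
+forward/backward time every print_interval passes.
+
+   local m = nn.Profile(nn.Linear(100, 100), 1000, 'fc')
+]]--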
diff --git a/contrib/lua-torch/nn/README.md b/contrib/lua-torch/nn/README.md
new file mode 100644
index 000000000..6efd60962
--- /dev/null
+++ b/contrib/lua-torch/nn/README.md
@@ -0,0 +1,21 @@
+[![Build Status](https://travis-ci.org/torch/nn.svg?branch=master)](https://travis-ci.org/torch/nn)
+<a name="nn.dok"></a>
+# Neural Network Package #
+
+This package provides an easy and modular way to build and train simple or complex neural networks using [Torch](https://github.com/torch/torch7/blob/master/README.md):
+ * Modules are the bricks used to build neural networks. Each is itself a neural network, but they can be combined with other networks using containers to create complex neural networks:
+ * [Module](doc/module.md#nn.Module): abstract class inherited by all modules;
+ * [Containers](doc/containers.md#nn.Containers): composite and decorator classes like [`Sequential`](doc/containers.md#nn.Sequential), [`Parallel`](doc/containers.md#nn.Parallel), [`Concat`](doc/containers.md#nn.Concat) and [`NaN`](doc/containers.md#nn.NaN);
+ * [Transfer functions](doc/transfer.md#nn.transfer.dok): non-linear functions like [`Tanh`](doc/transfer.md#nn.Tanh) and [`Sigmoid`](doc/transfer.md#nn.Sigmoid);
+ * [Simple layers](doc/simple.md#nn.simplelayers.dok): like [`Linear`](doc/simple.md#nn.Linear), [`Mean`](doc/simple.md#nn.Mean), [`Max`](doc/simple.md#nn.Max) and [`Reshape`](doc/simple.md#nn.Reshape);
+ * [Table layers](doc/table.md#nn.TableLayers): layers for manipulating `table`s like [`SplitTable`](doc/table.md#nn.SplitTable), [`ConcatTable`](doc/table.md#nn.ConcatTable) and [`JoinTable`](doc/table.md#nn.JoinTable);
+ * [Convolution layers](doc/convolution.md#nn.convlayers.dok): [`Temporal`](doc/convolution.md#nn.TemporalModules), [`Spatial`](doc/convolution.md#nn.SpatialModules) and [`Volumetric`](doc/convolution.md#nn.VolumetricModules) convolutions;
+ * Criterions compute a gradient according to a given loss function, given an input and a target:
+ * [Criterions](doc/criterion.md#nn.Criterions): a list of all criterions, including [`Criterion`](doc/criterion.md#nn.Criterion), the abstract class;
+ * [`MSECriterion`](doc/criterion.md#nn.MSECriterion): the Mean Squared Error criterion used for regression;
+ * [`ClassNLLCriterion`](doc/criterion.md#nn.ClassNLLCriterion): the Negative Log Likelihood criterion used for classification;
+ * Additional documentation:
+ * [Overview](doc/overview.md#nn.overview.dok) of the package essentials including modules, containers and training;
+ * [Training](doc/training.md#nn.traningneuralnet.dok): how to train a neural network using [`StochasticGradient`](doc/training.md#nn.StochasticGradient);
+ * [Testing](doc/testing.md): how to test your modules.
+ * [Experimental Modules](https://github.com/clementfarabet/lua---nnx/blob/master/README.md): a package containing experimental modules and criteria.
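+
+A minimal end-to-end sketch (illustrative; names and sizes are arbitrary):
+
+```lua
+require 'nn'
+local net = nn.Sequential()
+net:add(nn.Linear(10, 25)):add(nn.Tanh()):add(nn.Linear(25, 2))
+local criterion = nn.MSECriterion()
+local input, target = torch.rand(10), torch.rand(2)
+local output = net:forward(input)
+local loss = criterion:forward(output, target)
+net:zeroGradParameters()
+net:backward(input, criterion:backward(output, target))
+net:updateParameters(0.01) -- one SGD step
+```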
diff --git a/contrib/lua-torch/nn/RReLU.lua b/contrib/lua-torch/nn/RReLU.lua
new file mode 100644
index 000000000..843415f7e
--- /dev/null
+++ b/contrib/lua-torch/nn/RReLU.lua
@@ -0,0 +1,50 @@
+local ffi = require 'ffi'
+local RReLU, parent = torch.class('nn.RReLU', 'nn.Module')
+
+function RReLU:__init(l, u, ip)
+ parent.__init(self)
+ self.lower = l or 1/8
+ self.upper = u or 1/3
+ assert(self.lower <= self.upper and self.lower >= 0 and self.upper >= 0)
+ self.noise = torch.Tensor()
+ self.train = true
+ self.inplace = ip or false
+end
+
+function RReLU:updateOutput(input)
+ local gen = ffi.typeof('THGenerator**')(torch._gen)[0]
+ input.THNN.RReLU_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ self.noise:cdata(),
+ self.lower,
+ self.upper,
+ self.train,
+ self.inplace,
+ gen
+ )
+ return self.output
+end
+
+function RReLU:updateGradInput(input, gradOutput)
+ input.THNN.RReLU_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.noise:cdata(),
+ self.lower,
+ self.upper,
+ self.train,
+ self.inplace
+ )
+ return self.gradInput
+end
+
+function RReLU:__tostring__()
+ return string.format('%s (l:%f, u:%f)', torch.type(self), self.lower, self.upper)
+end
+
+function RReLU:clearState()
+ if self.noise then self.noise:set() end
+ return parent.clearState(self)
+end
diff --git a/contrib/lua-torch/nn/ReLU.lua b/contrib/lua-torch/nn/ReLU.lua
new file mode 100644
index 000000000..a6eb271ee
--- /dev/null
+++ b/contrib/lua-torch/nn/ReLU.lua
@@ -0,0 +1,5 @@
+local ReLU, Parent = torch.class('nn.ReLU', 'nn.Threshold')
+
+function ReLU:__init(p)
+ Parent.__init(self,0,0,p)
+end
diff --git a/contrib/lua-torch/nn/ReLU6.lua b/contrib/lua-torch/nn/ReLU6.lua
new file mode 100644
index 000000000..1cde00b46
--- /dev/null
+++ b/contrib/lua-torch/nn/ReLU6.lua
@@ -0,0 +1,32 @@
+local ReLU6, parent = torch.class('nn.ReLU6', 'nn.Module')
+
+function ReLU6:__init(inplace)
+   parent.__init(self)
+
+   if inplace ~= nil and type(inplace) ~= 'boolean' then
+      error('in-place flag must be boolean')
+   end
+   self.inplace = inplace or false
+end
+
+function ReLU6:updateOutput(input)
+ input.THNN.HardTanh_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ 0, 6, self.inplace)
+ return self.output
+end
+
+function ReLU6:updateGradInput(input, gradOutput)
+ input.THNN.HardTanh_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ 0, 6, self.inplace)
+ return self.gradInput
+end
diff --git a/contrib/lua-torch/nn/Replicate.lua b/contrib/lua-torch/nn/Replicate.lua
new file mode 100644
index 000000000..c7dedd767
--- /dev/null
+++ b/contrib/lua-torch/nn/Replicate.lua
@@ -0,0 +1,57 @@
+local Replicate, parent = torch.class('nn.Replicate','nn.Module')
+
+function Replicate:__init(nf, dim, ndim)
+ parent.__init(self)
+ self.nfeatures = nf
+ self.dim = dim or 1
+ self.ndim = ndim
+ assert(self.dim > 0, "Can only replicate across positive integer dimensions.")
+end
+
+function Replicate:updateOutput(input)
+ self.dim = self.dim or 1 --backwards compatible
+ assert(
+ self.dim <= input:dim()+1,
+ "Not enough input dimensions to replicate along dimension " ..
+ tostring(self.dim) .. ".")
+ local batchOffset = self.ndim and input:dim() > self.ndim and 1 or 0
+ local rdim = self.dim + batchOffset
+ local sz = torch.LongStorage(input:dim()+1)
+ sz[rdim] = self.nfeatures
+ for i = 1,input:dim() do
+ local offset = 0
+ if i >= rdim then
+ offset = 1
+ end
+ sz[i+offset] = input:size(i)
+ end
+ local st = torch.LongStorage(input:dim()+1)
+ st[rdim] = 0
+ for i = 1,input:dim() do
+ local offset = 0
+ if i >= rdim then
+ offset = 1
+ end
+ st[i+offset] = input:stride(i)
+ end
+ self.output:set(input:storage(),input:storageOffset(),sz,st)
+ return self.output
+end
+
+function Replicate:updateGradInput(input, gradOutput)
+ self.gradInput:resizeAs(input):zero()
+ local batchOffset = self.ndim and input:dim() > self.ndim and 1 or 0
+ local rdim = self.dim + batchOffset
+ local sz = torch.LongStorage(input:dim()+1)
+ sz[rdim] = 1
+ for i = 1,input:dim() do
+ local offset = 0
+ if i >= rdim then
+ offset = 1
+ end
+ sz[i+offset] = input:size(i)
+ end
+ local gradInput = self.gradInput:view(sz)
+ gradInput:sum(gradOutput, rdim)
+ return self.gradInput
+end
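+
+--[[ Usage sketch (illustrative): replication is done with a zero stride, so the
+forward pass copies no memory.
+
+   local r = nn.Replicate(3, 1)
+   r:forward(torch.Tensor{1, 2}) -- 3 x 2 tensor, each row {1, 2}
+]]--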
diff --git a/contrib/lua-torch/nn/Reshape.lua b/contrib/lua-torch/nn/Reshape.lua
new file mode 100644
index 000000000..d508369fa
--- /dev/null
+++ b/contrib/lua-torch/nn/Reshape.lua
@@ -0,0 +1,72 @@
+local Reshape, parent = torch.class('nn.Reshape', 'nn.Module')
+
+function Reshape:__init(...)
+ parent.__init(self)
+ local arg = {...}
+
+ self.size = torch.LongStorage()
+ self.batchsize = torch.LongStorage()
+ if torch.type(arg[#arg]) == 'boolean' then
+ self.batchMode = arg[#arg]
+ table.remove(arg, #arg)
+ end
+ local n = #arg
+ if n == 1 and torch.typename(arg[1]) == 'torch.LongStorage' then
+ self.size:resize(#arg[1]):copy(arg[1])
+ else
+ self.size:resize(n)
+ for i=1,n do
+ self.size[i] = arg[i]
+ end
+ end
+
+ self.nelement = 1
+ self.batchsize:resize(#self.size+1)
+ for i=1,#self.size do
+ self.nelement = self.nelement * self.size[i]
+ self.batchsize[i+1] = self.size[i]
+ end
+end
+
+function Reshape:updateOutput(input)
+ if not input:isContiguous() then
+ self._input = self._input or input.new()
+ self._input:resizeAs(input)
+ self._input:copy(input)
+ input = self._input
+ end
+
+ if (self.batchMode == false) or (
+ (self.batchMode == nil) and
+ (input:nElement() == self.nelement and input:size(1) ~= 1)
+ ) then
+ self.output:view(input, self.size)
+ else
+ self.batchsize[1] = input:size(1)
+ self.output:view(input, self.batchsize)
+ end
+ return self.output
+end
+
+function Reshape:updateGradInput(input, gradOutput)
+ if not gradOutput:isContiguous() then
+ self._gradOutput = self._gradOutput or gradOutput.new()
+ self._gradOutput:resizeAs(gradOutput)
+ self._gradOutput:copy(gradOutput)
+ gradOutput = self._gradOutput
+ end
+
+ self.gradInput:viewAs(gradOutput, input)
+ return self.gradInput
+end
+
+
+function Reshape:__tostring__()
+ return torch.type(self) .. '(' ..
+ table.concat(self.size:totable(), 'x') .. ')'
+end
+
+function Reshape:clearState()
+ nn.utils.clear(self, '_input', '_gradOutput')
+ return parent.clearState(self)
+end
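+
+--[[ Usage sketch (illustrative):
+
+   local m = nn.Reshape(2, 8)
+   m:forward(torch.randn(4, 4))    -- viewed as 2 x 8
+   m:forward(torch.randn(5, 4, 4)) -- batch of 5, viewed as 5 x 2 x 8
+]]--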
diff --git a/contrib/lua-torch/nn/Select.lua b/contrib/lua-torch/nn/Select.lua
new file mode 100644
index 000000000..be87c6465
--- /dev/null
+++ b/contrib/lua-torch/nn/Select.lua
@@ -0,0 +1,24 @@
+local Select, parent = torch.class('nn.Select', 'nn.Module')
+
+function Select:__init(dimension,index)
+ parent.__init(self)
+ self.dimension = dimension
+ self.index = index
+end
+
+function Select:updateOutput(input)
+ local dim = self.dimension < 0 and input:dim() + self.dimension + 1 or self.dimension
+ local index = self.index < 0 and input:size(dim) + self.index + 1 or self.index
+ local output = input:select(dim, index);
+ self.output:resizeAs(output)
+ return self.output:copy(output)
+end
+
+function Select:updateGradInput(input, gradOutput)
+ local dim = self.dimension < 0 and input:dim() + self.dimension + 1 or self.dimension
+ local index = self.index < 0 and input:size(dim) + self.index + 1 or self.index
+ self.gradInput:resizeAs(input)
+ self.gradInput:zero()
+ self.gradInput:select(dim,index):copy(gradOutput)
+ return self.gradInput
+end
diff --git a/contrib/lua-torch/nn/SelectTable.lua b/contrib/lua-torch/nn/SelectTable.lua
new file mode 100644
index 000000000..ef26f3507
--- /dev/null
+++ b/contrib/lua-torch/nn/SelectTable.lua
@@ -0,0 +1,71 @@
+local SelectTable, parent = torch.class('nn.SelectTable', 'nn.Module')
+
+function SelectTable:__init(index)
+ parent.__init(self)
+ self.index = index
+ self.gradInput = {}
+end
+
+function SelectTable:updateOutput(input)
+
+ -- handle negative indices
+ local index = self.index
+ if type(index) == "number" then
+ index = index < 0 and #input + index + 1 or index
+ end
+
+ assert(input[index], "index does not exist in the input table")
+ self.output = input[index]
+
+ return self.output
+end
+
+local function zeroTableCopy(t1, t2)
+ for k, v in pairs(t2) do
+ if (torch.type(v) == "table") then
+ t1[k] = zeroTableCopy(t1[k] or {}, t2[k])
+ elseif torch.isTensor(v) then
+ if not t1[k] then
+ t1[k] = v:clone():zero()
+ else
+ t1[k]:resizeAs(v)
+ t1[k]:zero()
+ end
+ else
+ t1[k] = nil
+ end
+ end
+ for k, v in pairs(t1) do
+ if not t2[k] then
+ t1[k] = nil
+ end
+ end
+ return t1
+end
+
+function SelectTable:updateGradInput(input, gradOutput)
+ -- make gradInput a zeroed copy of input
+ zeroTableCopy(self.gradInput, input)
+ -- handle negative indices
+ local index = self.index
+ if type(index) == "number" then
+ index = index < 0 and #input + index + 1 or index
+ end
+ -- copy into gradInput[index] (necessary for variable sized inputs)
+ assert(self.gradInput[index])
+ nn.utils.recursiveCopy(self.gradInput[index], gradOutput)
+
+ return self.gradInput
+end
+
+function SelectTable:type(type, tensorCache)
+ self.gradInput = {}
+ self.output = {}
+ return parent.type(self, type, tensorCache)
+end
+
+function SelectTable:__tostring__()
+ return torch.type(self) .. '(' .. self.index .. ')'
+end
+
+SelectTable.clearState = nn.Identity.clearState
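+
+--[[ Usage sketch (illustrative): negative indices select from the end of the
+table, as handled above.
+
+   local s = nn.SelectTable(-1)
+   s:forward{torch.randn(2), torch.randn(3)} -- returns the 3-element tensor
+]]--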
diff --git a/contrib/lua-torch/nn/Sequential.lua b/contrib/lua-torch/nn/Sequential.lua
new file mode 100644
index 000000000..22b0886b8
--- /dev/null
+++ b/contrib/lua-torch/nn/Sequential.lua
@@ -0,0 +1,122 @@
+local Sequential, _ = torch.class('nn.Sequential', 'nn.Container')
+
+function Sequential:__len()
+ return #self.modules
+end
+
+function Sequential:add(module)
+ if #self.modules == 0 then
+ self.gradInput = module.gradInput
+ end
+ table.insert(self.modules, module)
+ self.output = module.output
+ return self
+end
+
+function Sequential:insert(module, index)
+ index = index or (#self.modules + 1)
+ if index > (#self.modules + 1) or index < 1 then
+ error"index should be contiguous to existing modules"
+ end
+ table.insert(self.modules, index, module)
+ self.output = self.modules[#self.modules].output
+ self.gradInput = self.modules[1].gradInput
+end
+
+function Sequential:remove(index)
+ index = index or #self.modules
+ if index > #self.modules or index < 1 then
+ error"index out of range"
+ end
+ table.remove(self.modules, index)
+ if #self.modules > 0 then
+ self.output = self.modules[#self.modules].output
+ self.gradInput = self.modules[1].gradInput
+ else
+ self.output = torch.Tensor()
+ self.gradInput = torch.Tensor()
+ end
+end
+
+function Sequential:updateOutput(input)
+ local currentOutput = input
+ for i=1,#self.modules do
+ currentOutput = self:rethrowErrors(self.modules[i], i, 'updateOutput', currentOutput)
+ end
+ self.output = currentOutput
+ return currentOutput
+end
+
+function Sequential:updateGradInput(input, gradOutput)
+ local currentGradOutput = gradOutput
+ local currentModule = self.modules[#self.modules]
+ for i=#self.modules-1,1,-1 do
+ local previousModule = self.modules[i]
+ currentGradOutput = self:rethrowErrors(currentModule, i+1, 'updateGradInput', previousModule.output, currentGradOutput)
+ currentModule = previousModule
+ end
+ currentGradOutput = self:rethrowErrors(currentModule, 1, 'updateGradInput', input, currentGradOutput)
+ self.gradInput = currentGradOutput
+ return currentGradOutput
+end
+
+function Sequential:accGradParameters(input, gradOutput, scale)
+ scale = scale or 1
+
+ local currentGradOutput = gradOutput
+ local currentModule = self.modules[#self.modules]
+ for i=#self.modules-1,1,-1 do
+ local previousModule = self.modules[i]
+ self:rethrowErrors(currentModule, i+1, 'accGradParameters', previousModule.output, currentGradOutput, scale)
+ currentGradOutput = currentModule.gradInput
+ currentModule = previousModule
+ end
+
+ self:rethrowErrors(currentModule, 1, 'accGradParameters', input, currentGradOutput, scale)
+end
+
+function Sequential:backward(input, gradOutput, scale)
+ scale = scale or 1
+ local currentGradOutput = gradOutput
+ local currentModule = self.modules[#self.modules]
+ for i=#self.modules-1,1,-1 do
+ local previousModule = self.modules[i]
+ currentGradOutput = self:rethrowErrors(currentModule, i+1, 'backward', previousModule.output, currentGradOutput, scale)
+ currentModule.gradInput = currentGradOutput
+ currentModule = previousModule
+ end
+ currentGradOutput = self:rethrowErrors(currentModule, 1, 'backward', input, currentGradOutput, scale)
+ self.gradInput = currentGradOutput
+ return currentGradOutput
+end
+
+function Sequential:accUpdateGradParameters(input, gradOutput, lr)
+ local currentGradOutput = gradOutput
+ local currentModule = self.modules[#self.modules]
+ for i=#self.modules-1,1,-1 do
+ local previousModule = self.modules[i]
+ self:rethrowErrors(currentModule, i+1, 'accUpdateGradParameters', previousModule.output, currentGradOutput, lr)
+ currentGradOutput = currentModule.gradInput
+ currentModule = previousModule
+ end
+
+ self:rethrowErrors(currentModule, 1, 'accUpdateGradParameters', input, currentGradOutput, lr)
+end
+
+
+function Sequential:__tostring__()
+ local tab = ' '
+ local line = '\n'
+ local next = ' -> '
+ local str = 'nn.Sequential'
+ str = str .. ' {' .. line .. tab .. '[input'
+ for i=1,#self.modules do
+ str = str .. next .. '(' .. i .. ')'
+ end
+ str = str .. next .. 'output]'
+ for i=1,#self.modules do
+ str = str .. line .. tab .. '(' .. i .. '): ' .. tostring(self.modules[i]):gsub(line, line .. tab)
+ end
+ str = str .. line .. '}'
+ return str
+end
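+
+--[[ Usage sketch (illustrative): modules are chained so that each output feeds
+the next input.
+
+   local mlp = nn.Sequential()
+   mlp:add(nn.Linear(10, 20)):add(nn.Tanh()):add(nn.Linear(20, 1))
+   mlp:forward(torch.randn(10)) -- 1-element tensor
+]]--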
diff --git a/contrib/lua-torch/nn/Sigmoid.lua b/contrib/lua-torch/nn/Sigmoid.lua
new file mode 100644
index 000000000..0126f6f8f
--- /dev/null
+++ b/contrib/lua-torch/nn/Sigmoid.lua
@@ -0,0 +1,19 @@
+local Sigmoid = torch.class('nn.Sigmoid', 'nn.Module')
+
+function Sigmoid:updateOutput(input)
+ input.THNN.Sigmoid_updateOutput(
+ input:cdata(),
+ self.output:cdata()
+ )
+ return self.output
+end
+
+function Sigmoid:updateGradInput(input, gradOutput)
+ input.THNN.Sigmoid_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.output:cdata()
+ )
+ return self.gradInput
+end
diff --git a/contrib/lua-torch/nn/SmoothL1Criterion.lua b/contrib/lua-torch/nn/SmoothL1Criterion.lua
new file mode 100644
index 000000000..be636a94c
--- /dev/null
+++ b/contrib/lua-torch/nn/SmoothL1Criterion.lua
@@ -0,0 +1,32 @@
+local SmoothL1Criterion, parent = torch.class('nn.SmoothL1Criterion', 'nn.Criterion')
+
+function SmoothL1Criterion:__init(sizeAverage)
+ parent.__init(self)
+ if sizeAverage ~= nil then
+ self.sizeAverage = sizeAverage
+ else
+ self.sizeAverage = true
+ end
+end
+
+function SmoothL1Criterion:updateOutput(input, target)
+ self.output_tensor = self.output_tensor or input.new(1)
+ input.THNN.SmoothL1Criterion_updateOutput(
+ input:cdata(),
+ target:cdata(),
+ self.output_tensor:cdata(),
+ self.sizeAverage
+ )
+ self.output = self.output_tensor[1]
+ return self.output
+end
+
+function SmoothL1Criterion:updateGradInput(input, target)
+ input.THNN.SmoothL1Criterion_updateGradInput(
+ input:cdata(),
+ target:cdata(),
+ self.gradInput:cdata(),
+ self.sizeAverage
+ )
+ return self.gradInput
+end
diff --git a/contrib/lua-torch/nn/SoftMarginCriterion.lua b/contrib/lua-torch/nn/SoftMarginCriterion.lua
new file mode 100644
index 000000000..96ccda8a4
--- /dev/null
+++ b/contrib/lua-torch/nn/SoftMarginCriterion.lua
@@ -0,0 +1,24 @@
+local SoftMarginCriterion, parent = torch.class('nn.SoftMarginCriterion', 'nn.Criterion')
+
+function SoftMarginCriterion:__init()
+ parent.__init(self)
+ self.sizeAverage = true
+end
+
+function SoftMarginCriterion:updateOutput(input, target)
+ self.output_tensor = self.output_tensor or input.new(1)
+ input.THNN.SoftMarginCriterion_updateOutput(
+ input:cdata(), target:cdata(),
+ self.output_tensor:cdata(),
+ self.sizeAverage)
+ self.output = self.output_tensor[1]
+ return self.output
+end
+
+function SoftMarginCriterion:updateGradInput(input, target)
+ input.THNN.SoftMarginCriterion_updateGradInput(
+ input:cdata(), target:cdata(),
+ self.gradInput:cdata(),
+ self.sizeAverage)
+ return self.gradInput
+end
diff --git a/contrib/lua-torch/nn/SoftMax.lua b/contrib/lua-torch/nn/SoftMax.lua
new file mode 100644
index 000000000..23a444cf6
--- /dev/null
+++ b/contrib/lua-torch/nn/SoftMax.lua
@@ -0,0 +1,19 @@
+local SoftMax, _ = torch.class('nn.SoftMax', 'nn.Module')
+
+function SoftMax:updateOutput(input)
+ input.THNN.SoftMax_updateOutput(
+ input:cdata(),
+ self.output:cdata()
+ )
+ return self.output
+end
+
+function SoftMax:updateGradInput(input, gradOutput)
+ input.THNN.SoftMax_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.output:cdata()
+ )
+ return self.gradInput
+end
diff --git a/contrib/lua-torch/nn/SoftMin.lua b/contrib/lua-torch/nn/SoftMin.lua
new file mode 100644
index 000000000..7da2a6589
--- /dev/null
+++ b/contrib/lua-torch/nn/SoftMin.lua
@@ -0,0 +1,31 @@
+local SoftMin, parent = torch.class('nn.SoftMin', 'nn.Module')
+
+function SoftMin:updateOutput(input)
+ self.mininput = self.mininput or input.new()
+ self.mininput:resizeAs(input):copy(input):mul(-1)
+ input.THNN.SoftMax_updateOutput(
+ self.mininput:cdata(),
+ self.output:cdata()
+ )
+ return self.output
+end
+
+function SoftMin:updateGradInput(input, gradOutput)
+ self.mininput = self.mininput or input.new()
+ self.mininput:resizeAs(input):copy(input):mul(-1)
+
+ input.THNN.SoftMax_updateGradInput(
+ self.mininput:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.output:cdata()
+ )
+
+ self.gradInput:mul(-1)
+ return self.gradInput
+end
+
+function SoftMin:clearState()
+ if self.mininput then self.mininput:set() end
+ return parent.clearState(self)
+end
diff --git a/contrib/lua-torch/nn/SoftPlus.lua b/contrib/lua-torch/nn/SoftPlus.lua
new file mode 100644
index 000000000..f77b25380
--- /dev/null
+++ b/contrib/lua-torch/nn/SoftPlus.lua
@@ -0,0 +1,35 @@
+local SoftPlus, parent = torch.class('nn.SoftPlus', 'nn.Module')
+
+function SoftPlus:__init(beta)
+ parent.__init(self)
+ self.beta = beta or 1 -- Beta controls sharpness of transfer function
+ self.threshold = 20 -- Avoid floating point issues with exp(x), x>20
+end
+
+function SoftPlus:updateOutput(input)
+ -- f(x) = 1/beta * log(1 + exp(beta * x))
+ input.THNN.SoftPlus_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ self.beta,
+ self.threshold
+ )
+ return self.output
+end
+
+function SoftPlus:updateGradInput(input, gradOutput)
+ -- d/dx[log(1+exp(k*x))/k] = exp(kx) / (exp(kx) + 1)
+ -- SINCE
+ -- y = (1/k)*log(1+exp(k*x)) --> x = (1/k)*log(exp(k*y)-1)
+ -- THEREFORE:
+ -- d/dx(f(x)) = (exp(k*y) - 1) / exp(k*y)
+ input.THNN.SoftPlus_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.output:cdata(),
+ self.beta,
+ self.threshold
+ )
+ return self.gradInput
+end
diff --git a/contrib/lua-torch/nn/SoftShrink.lua b/contrib/lua-torch/nn/SoftShrink.lua
new file mode 100644
index 000000000..67af15a98
--- /dev/null
+++ b/contrib/lua-torch/nn/SoftShrink.lua
@@ -0,0 +1,25 @@
+local SoftShrink, parent = torch.class('nn.SoftShrink', 'nn.Module')
+
+function SoftShrink:__init(lam)
+ parent.__init(self)
+ self.lambda = lam or 0.5
+end
+
+function SoftShrink:updateOutput(input)
+ input.THNN.SoftShrink_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ self.lambda
+ )
+ return self.output
+end
+
+function SoftShrink:updateGradInput(input, gradOutput)
+ input.THNN.SoftShrink_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.lambda
+ )
+ return self.gradInput
+end
diff --git a/contrib/lua-torch/nn/SoftSign.lua b/contrib/lua-torch/nn/SoftSign.lua
new file mode 100644
index 000000000..ee72011f1
--- /dev/null
+++ b/contrib/lua-torch/nn/SoftSign.lua
@@ -0,0 +1,20 @@
+local SoftSign, parent = torch.class('nn.SoftSign', 'nn.Module')
+
+function SoftSign:updateOutput(input)
+ self.temp = self.temp or input.new()
+ self.temp:resizeAs(input):copy(input):abs():add(1)
+ self.output:resizeAs(input):copy(input):cdiv(self.temp)
+ return self.output
+end
+
+function SoftSign:updateGradInput(input, gradOutput)
+ self.tempgrad = self.tempgrad or input.new()
+ self.tempgrad:resizeAs(self.output):copy(input):abs():add(1):cmul(self.tempgrad)
+ self.gradInput:resizeAs(input):copy(gradOutput):cdiv(self.tempgrad)
+ return self.gradInput
+end
+
+function SoftSign:clearState()
+ nn.utils.clear(self, 'temp', 'tempgrad')
+ return parent.clearState(self)
+end
diff --git a/contrib/lua-torch/nn/SparseJacobian.lua b/contrib/lua-torch/nn/SparseJacobian.lua
new file mode 100644
index 000000000..7f4c02444
--- /dev/null
+++ b/contrib/lua-torch/nn/SparseJacobian.lua
@@ -0,0 +1,277 @@
+nn.SparseJacobian = {}
+
+function nn.SparseJacobian.backward (module, input, param, dparam)
+ local doparam = 0
+ if param then
+ doparam = 1
+ end
+
+ -- output deriv
+ module:forward(input)
+ local dout = module.output.new():resizeAs(module.output)
+ -- 1D view
+ local sdout = module.output.new(dout:storage(), 1, dout:nElement())
+ -- jacobian matrix to calculate
+ local jacobian
+ if doparam == 1 then
+ jacobian = torch.Tensor(param:nElement(), dout:nElement()):zero()
+ else
+ jacobian = torch.Tensor(input:size(1), dout:nElement()):zero()
+ end
+
+ for i=1,sdout:nElement() do
+ dout:zero()
+ sdout[i] = 1
+ module:zeroGradParameters()
+ local din = module:updateGradInput(input, dout)
+ module:accGradParameters(input, dout)
+ if doparam == 1 then
+ jacobian:select(2,i):copy(dparam)
+ else
+ jacobian:select(2,i):copy(din:select(2,2))
+ end
+ end
+
+ return jacobian
+end
+
+
+function nn.SparseJacobian.backwardUpdate (module, input, param)
+
+ -- output deriv
+ module:forward(input)
+ local dout = module.output.new():resizeAs(module.output)
+ -- 1D view
+ local sdout = module.output.new(dout:storage(),1,dout:nElement())
+ -- jacobian matrix to calculate
+ local jacobian = torch.Tensor(param:nElement(),dout:nElement()):zero()
+
+ -- original param
+ local params = module:parameters()
+ local origparams = {}
+ for j=1,#params do
+ table.insert(origparams, params[j]:clone())
+ end
+
+ for i=1,sdout:nElement() do
+ -- Reset parameters
+ for j=1,#params do
+ params[j]:copy(origparams[j])
+ end
+ dout:zero()
+ sdout[i] = 1
+ module:zeroGradParameters()
+ module:updateGradInput(input, dout)
+ module:accUpdateGradParameters(input, dout, 1)
+ jacobian:select(2,i):copy(param)
+ end
+
+ for j=1,#params do
+ params[j]:copy(origparams[j])
+ end
+
+ return jacobian
+end
+
+function nn.SparseJacobian.forward(module, input, param)
+ local doparam = 0
+ if param then
+ doparam = 1
+ end
+ param = param or input
+
+ -- perturbation amount
+ local small = 1e-6
+ -- 1D view of input
+ --local tst = param:storage()
+ local sin
+ if doparam == 1 then
+ sin = param.new(param):resize(param:nElement())
+ else
+ sin = input.new(input):select(2,2)
+ end
+
+ local out = module:forward(input)
+ -- jacobian matrix to calculate
+ local jacobian
+ if doparam == 1 then
+ jacobian = torch.Tensor():resize(param:nElement(),
+ out:nElement())
+ else
+ jacobian = torch.Tensor():resize(input:size(1),
+ out:nElement())
+ end
+
+ local outa = torch.Tensor(jacobian:size(2))
+ local outb = torch.Tensor(jacobian:size(2))
+
+ for i=1,sin:nElement() do
+ sin[i] = sin[i] - small
+ outa:copy(module:forward(input))
+ sin[i] = sin[i] + 2*small
+ outb:copy(module:forward(input))
+ sin[i] = sin[i] - small
+
+ outb:add(-1,outa):div(2*small)
+ jacobian:select(1,i):copy(outb)
+ end
+
+ return jacobian
+end
+
+function nn.SparseJacobian.forwardUpdate(module, input, param)
+ -- perturbation amount
+ local small = 1e-6
+ -- 1D view of input
+ --local tst = param:storage()
+ local sin = param.new(param):resize(param:nElement())--param.new(tst,1,tst:size())
+ -- jacobian matrix to calculate
+ local jacobian = torch.Tensor():resize(param:nElement(),module:forward(input):nElement())
+
+ local outa = torch.Tensor(jacobian:size(2))
+ local outb = torch.Tensor(jacobian:size(2))
+
+ for i=1,sin:nElement() do
+ sin[i] = sin[i] - small
+ outa:copy(module:forward(input))
+ sin[i] = sin[i] + 2*small
+ outb:copy(module:forward(input))
+ sin[i] = sin[i] - small
+
+ outb:add(-1,outa):div(2*small)
+ jacobian:select(1,i):copy(outb)
+ jacobian:select(1,i):mul(-1)
+ jacobian:select(1,i):add(sin[i])
+ end
+ return jacobian
+end
+
+function nn.SparseJacobian.testJacobian (module, input, minval, maxval)
+ minval = minval or -2
+ maxval = maxval or 2
+ local inrange = maxval - minval
+ input:select(2,2):copy(torch.rand(input:size(1)):mul(inrange):add(minval))
+ local jac_fprop = nn.SparseJacobian.forward(module,input)
+ local jac_bprop = nn.SparseJacobian.backward(module,input)
+ local error = jac_fprop-jac_bprop
+ return error:abs():max()
+end
+
+function nn.SparseJacobian.testJacobianParameters (module, input, param, dparam, minval, maxval)
+ minval = minval or -2
+ maxval = maxval or 2
+ local inrange = maxval - minval
+ input:select(2,2):copy(torch.rand(input:size(1)):mul(inrange):add(minval))
+ param:copy(torch.rand(param:nElement()):mul(inrange):add(minval))
+ local jac_bprop = nn.SparseJacobian.backward(module, input, param, dparam)
+ local jac_fprop = nn.SparseJacobian.forward(module, input, param)
+ local error = jac_fprop - jac_bprop
+ return error:abs():max()
+end
+
+function nn.SparseJacobian.testJacobianUpdateParameters (module, input, param, minval, maxval)
+ minval = minval or -2
+ maxval = maxval or 2
+ local inrange = maxval - minval
+ input:select(2,2):copy(torch.rand(input:size(1)):mul(inrange):add(minval))
+ param:copy(torch.rand(param:nElement()):mul(inrange):add(minval))
+ local params_bprop = nn.SparseJacobian.backwardUpdate(module, input, param)
+ local params_fprop = nn.SparseJacobian.forwardUpdate(module, input, param)
+
+ local error = params_fprop - params_bprop
+ return error:abs():max()
+end
+
+function nn.SparseJacobian.testIO(module,input, minval, maxval)
+ minval = minval or -2
+ maxval = maxval or 2
+ local inrange = maxval - minval
+
+ -- run module
+ module:forward(input)
+ local go = module.output:clone():copy(torch.rand(module.output:nElement()):mul(inrange):add(minval))
+ module:zeroGradParameters()
+ module:updateGradInput(input,go)
+ module:accGradParameters(input,go)
+
+ local fo = module.output:clone()
+ local bo = module.gradInput:clone()
+
+ -- write module
+ local f = torch.DiskFile('tmp.bin','w'):binary()
+ f:writeObject(module)
+ f:close()
+ -- read module
+ local m = torch.DiskFile('tmp.bin'):binary():readObject()
+ m:forward(input)
+ m:zeroGradParameters()
+ m:updateGradInput(input,go)
+ m:accGradParameters(input,go)
+ -- cleanup
+ os.remove('tmp.bin')
+
+ local fo2 = m.output:clone()
+ local bo2 = m.gradInput:clone()
+
+ local errf = fo - fo2
+ local errb = bo - bo2
+ return errf:abs():max(), errb:abs():max()
+end
+
+function nn.SparseJacobian.testAllUpdate(module, input, weight, gradWeight)
+ local gradOutput
+ local lr = torch.uniform(0.1, 1)
+ local errors = {}
+
+ -- accGradParameters
+ local maccgp = module:clone()
+ local weightc = maccgp[weight]:clone()
+ maccgp:forward(input)
+ gradOutput = torch.rand(maccgp.output:size())
+ maccgp:zeroGradParameters()
+ maccgp:updateGradInput(input, gradOutput)
+ maccgp:accGradParameters(input, gradOutput)
+ maccgp:updateParameters(lr)
+ errors["accGradParameters"] = (weightc-maccgp[gradWeight]*lr-maccgp[weight]):norm()
+
+ -- accUpdateGradParameters
+ local maccugp = module:clone()
+ maccugp:forward(input)
+ maccugp:updateGradInput(input, gradOutput)
+ maccugp:accUpdateGradParameters(input, gradOutput, lr)
+ errors["accUpdateGradParameters"] = (maccugp[weight]-maccgp[weight]):norm()
+
+ -- shared, accGradParameters
+ local macsh1 = module:clone()
+ local macsh2 = module:clone()
+ macsh2:share(macsh1, weight)
+ macsh1:forward(input)
+ macsh2:forward(input)
+ macsh1:zeroGradParameters()
+ macsh2:zeroGradParameters()
+ macsh1:updateGradInput(input, gradOutput)
+ macsh2:updateGradInput(input, gradOutput)
+ macsh1:accGradParameters(input, gradOutput)
+ macsh2:accGradParameters(input, gradOutput)
+ macsh1:updateParameters(lr)
+ macsh2:updateParameters(lr)
+ local err = (weightc-maccgp[gradWeight]*(lr*2)-macsh1[weight]):norm()
+ err = err + (weightc-maccgp[gradWeight]*(lr*2)-macsh2[weight]):norm()
+ errors["accGradParameters [shared]"] = err
+
+ -- shared, accUpdateGradParameters
+ local macshu1 = module:clone()
+ local macshu2 = module:clone()
+ macshu2:share(macshu1, weight)
+ macshu1:forward(input)
+ macshu2:forward(input)
+ macshu1:updateGradInput(input, gradOutput)
+ macshu2:updateGradInput(input, gradOutput)
+ macshu1:accUpdateGradParameters(input, gradOutput, lr)
+ macshu2:accUpdateGradParameters(input, gradOutput, lr)
+ err = (weightc-maccgp[gradWeight]*(lr*2)-macshu1[weight]):norm()
+ err = err + (weightc-maccgp[gradWeight]*(lr*2)-macshu2[weight]):norm()
+ errors["accUpdateGradParameters [shared]"] = err
+
+ return errors
+end
diff --git a/contrib/lua-torch/nn/SparseLinear.lua b/contrib/lua-torch/nn/SparseLinear.lua
new file mode 100644
index 000000000..9a50c6912
--- /dev/null
+++ b/contrib/lua-torch/nn/SparseLinear.lua
@@ -0,0 +1,242 @@
+local THNN = require 'nn.THNN'
+local SparseLinear, parent = torch.class('nn.SparseLinear', 'nn.Module')
+
+local NO_LAST_INPUT = 0
+local ONE_LAST_INPUT = 1
+local ACC_MULTIPLE_TIMES = 2
+
+function SparseLinear:__init(inputSize, outputSize, doGradInput)
+ parent.__init(self)
+
+ self.weightDecay = 0
+ self.doGradInput = doGradInput or false
+ self.weight = torch.Tensor(outputSize, inputSize):zero()
+ self.bias = torch.Tensor(outputSize):zero()
+ self.gradWeight = torch.Tensor(outputSize, inputSize):zero()
+ self.gradBias = torch.Tensor(outputSize):zero()
+
+   assert(type(self.doGradInput) == 'boolean')
+
+ self.lastInput = nil
+ self.sparseUpdate = NO_LAST_INPUT
+ self.formatted_input = nil
+
+ -- state
+ self.gradInput = {}
+ self.output:resize(outputSize)
+
+ self:reset()
+end
+
+function SparseLinear:reset(stdv)
+ if stdv then
+ stdv = stdv * math.sqrt(3)
+ else
+ stdv = 1./math.sqrt(self.weight:size(2))
+ end
+ self.weight:uniform(-stdv, stdv)
+ self.bias:uniform(-stdv, stdv):mul(0.000001)
+end
+
+function SparseLinear:reshapeInput(input)
+ if type(input) == 'table' then
+ return input, true, false
+ else
+ if input:dim() == 2 then
+ return {input}, false, false
+ else
+ return input, true, true
+ end
+ end
+end
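+
+-- Input-format sketch (illustrative; 'sl' is assumed constructed above): in the
+-- non-legacy path each batch element is an n x 2 tensor of {index, value} pairs,
+-- e.g. for a single example with features 4 and 10 set:
+--
+--    local x = { torch.Tensor{{4, 0.5}, {10, 1.0}} }
+--    local out = sl:forward(x)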
+
+function SparseLinear:updateOutput(input)
+ if self.sparseUpdate == ONE_LAST_INPUT then
+ self.sparseUpdate = ACC_MULTIPLE_TIMES
+ end
+ local input, batchMode, legacyMode = self:reshapeInput(input)
+ self.legacyMode = legacyMode
+
+ if legacyMode then
+ input.THNN.SparseLinear_legacyUpdateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ self.weight:cdata(),
+ self.bias:cdata()
+ )
+ else
+ local nbatches = #input
+ if nbatches == 0 then
+ self.output:copy(self.bias)
+ return self.output
+ end
+
+ local size = 0
+ local marker = 1
+ self.formatted_input = self.formatted_input or input[1].new()
+
+ for i,v in ipairs(input) do size = size + input[i]:size(1) end
+ self.formatted_input:resize(size, 3)
+ for i,v in ipairs(input) do
+ local buf = self.formatted_input:narrow(1, marker, input[i]:size(1))
+ buf:narrow(2,2,2):copy(input[i])
+ buf:select(2,1):fill(i)
+ marker = marker + input[i]:size(1)
+ end
+
+ self.output:resize(nbatches, self.weight:size(1))
+ input[1].THNN.SparseLinear_updateOutput(
+ self.formatted_input:cdata(),
+ self.output:cdata(),
+ self.weight:cdata(),
+ self.bias:cdata()
+ )
+
+ -- fix output size for batchSize = 1
+ if not batchMode then
+ self.output = self.output[1]
+ end
+ end
+
+ return self.output
+end
+
+function SparseLinear:accGradParameters(input, gradOutput, scale)
+ local input, batchMode, legacyMode = self:reshapeInput(input)
+ self.legacyMode = legacyMode
+ self.lastInput = self.lastInput or gradOutput.new()
+ if self.sparseUpdate == NO_LAST_INPUT then
+ local v = self.formatted_input
+ if self.legacyMode then v = input end
+ self.lastInput:resizeAs(v):copy(v)
+ self.sparseUpdate = ONE_LAST_INPUT
+ elseif self.sparseUpdate == ONE_LAST_INPUT then
+ self.sparseUpdate = ACC_MULTIPLE_TIMES
+ end
+
+ if legacyMode then
+ input.THNN.SparseLinear_legacyAccGradParameters(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradWeight:cdata(),
+ self.gradBias:cdata(),
+ self.weight:cdata(),
+ self.bias:cdata(),
+ self.weightDecay or 0,
+ scale or 1
+ )
+ else
+ if not batchMode then
+ gradOutput:resize(1, gradOutput:size(1))
+ end
+
+ local rows = self.formatted_input:select(2, 1)
+ local cols = self.formatted_input:select(2, 2)
+ local sortinds = cols * gradOutput:size(1) + rows
+ local _, inds = sortinds:sort(1, false)
+ local newinput = self.formatted_input:index(1, inds)
+ input[1].THNN.SparseLinear_accGradParameters(
+ newinput:cdata(),
+ gradOutput:cdata(),
+ self.gradWeight:cdata(),
+ self.gradBias:cdata(),
+ self.weight:cdata(),
+ self.bias:cdata(),
+ self.weightDecay or 0,
+ scale or 1
+ )
+ end
+end
+
+function SparseLinear:updateGradInput(input, gradOutput)
+ if self.legacyMode then
+ if type(self.gradInput) ~= type(gradOutput) then self.gradInput = gradOutput.new() end
+ self.gradInput:resizeAs(input)
+ else
+ self.gradInput = {}
+ end
+ if self.doGradInput then
+ -- GradInput should be dense anyway
+ local gi
+ local batchMode = true
+ if gradOutput:dim() == 1 then
+ gi = self.weight:t()*gradOutput
+ batchMode = false
+ elseif gradOutput:dim() == 2 then
+ gi = gradOutput*self.weight
+ end
+ local ini = self.weight:size(2)
+
+ if self.legacyMode then
+ local batches = self.gradInput:size(1)
+ self.gradInput:resize(batches, ini, 2)
+ self.gradInput:select(3,1):copy(torch.repeatTensor(torch.range(1, ini), batches, 1))
+ self.gradInput:select(3,2):copy(gi)
+ else
+ if not batchMode then gi:resize(1, ini) end
+ for i = 1,gi:size(1) do
+ self.gradInput[i] = gradOutput.new(ini, 2)
+ self.gradInput[i]:select(2, 2):copy(gi[i])
+ self.gradInput[i]:select(2, 1):range(1, ini)
+ end
+ end
+ end
+ return self.gradInput
+end
+
+-- These functions do sparse updates / zeros. However, if we accumulated
+-- gradients multiple times, we can't depend on the last input to do sparse
+-- updates.
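+-- A typical cycle that keeps the sparse path alive (illustrative sketch):
+--   module:zeroGradParameters()                -- state: NO_LAST_INPUT
+--   module:forward(x); module:backward(x, dy)  -- records x: ONE_LAST_INPUT
+--   module:updateParameters(lr)                -- sparse update path is taken
+-- A second forward/backward before zeroing moves the state to
+-- ACC_MULTIPLE_TIMES, and the dense parent implementations are used instead.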
+function SparseLinear:updateParameters(learningRate)
+ if self.lastInput and self.sparseUpdate == ONE_LAST_INPUT then
+ if self.legacyMode then
+ self.lastInput.THNN.SparseLinear_legacyUpdateParameters(
+ self.weight:cdata(),
+ self.bias:cdata(),
+ self.gradWeight:cdata(),
+ self.gradBias:cdata(),
+ self.lastInput:cdata(),
+ learningRate
+ )
+ else
+ self.lastInput.THNN.SparseLinear_updateParameters(
+ self.weight:cdata(),
+ self.bias:cdata(),
+ self.gradWeight:cdata(),
+ self.gradBias:cdata(),
+ self.lastInput:cdata(),
+ learningRate
+ )
+ end
+ else
+ parent.updateParameters(self, learningRate)
+ end
+end
+
+function SparseLinear:zeroGradParameters()
+ if self.lastInput and self.sparseUpdate == ONE_LAST_INPUT then
+ if self.legacyMode then
+ self.lastInput.THNN.SparseLinear_legacyZeroGradParameters(
+ self.gradWeight:cdata(),
+ self.gradBias:cdata(),
+ self.lastInput:cdata()
+ )
+ else
+ self.lastInput.THNN.SparseLinear_zeroGradParameters(
+ self.gradWeight:cdata(),
+ self.gradBias:cdata(),
+ self.lastInput:cdata()
+ )
+ end
+ else
+ parent.zeroGradParameters(self)
+ end
+ self.sparseUpdate = NO_LAST_INPUT
+end
+
+function SparseLinear:clearState()
+ if self.lastInput then self.lastInput:set() end
+ -- SparseLinear_cudaClearState is only provided by the CUDA backend
+ if self.weight.THNN.SparseLinear_cudaClearState then
+ self.weight.THNN.SparseLinear_cudaClearState()
+ end
+ return parent.clearState(self)
+end
diff --git a/contrib/lua-torch/nn/SpatialAdaptiveAveragePooling.lua b/contrib/lua-torch/nn/SpatialAdaptiveAveragePooling.lua
new file mode 100644
index 000000000..2e223580a
--- /dev/null
+++ b/contrib/lua-torch/nn/SpatialAdaptiveAveragePooling.lua
@@ -0,0 +1,35 @@
+local SpatialAdaptiveAveragePooling, parent = torch.class('nn.SpatialAdaptiveAveragePooling', 'nn.Module')
+
+function SpatialAdaptiveAveragePooling:__init(W, H)
+ parent.__init(self)
+
+ self.W = W
+ self.H = H
+end
+
+function SpatialAdaptiveAveragePooling:updateOutput(input)
+ input.THNN.SpatialAdaptiveAveragePooling_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ self.W, self.H
+ )
+ return self.output
+end
+
+function SpatialAdaptiveAveragePooling:updateGradInput(input, gradOutput)
+ input.THNN.SpatialAdaptiveAveragePooling_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata()
+ )
+ return self.gradInput
+end
+
+-- for backward compat
+function SpatialAdaptiveAveragePooling:empty()
+ self:clearState()
+end
+
+function SpatialAdaptiveAveragePooling:clearState()
+ return parent.clearState(self)
+end
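+
+-- Illustrative usage (sketch): pools any input down to a fixed W x H grid,
+-- independent of the input's spatial size:
+--   local pool = nn.SpatialAdaptiveAveragePooling(4, 4)
+--   local y = pool:forward(torch.rand(3, 17, 23))  -- y has size 3x4x4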
diff --git a/contrib/lua-torch/nn/SpatialAdaptiveMaxPooling.lua b/contrib/lua-torch/nn/SpatialAdaptiveMaxPooling.lua
new file mode 100644
index 000000000..b78261c3d
--- /dev/null
+++ b/contrib/lua-torch/nn/SpatialAdaptiveMaxPooling.lua
@@ -0,0 +1,46 @@
+local SpatialAdaptiveMaxPooling, parent = torch.class('nn.SpatialAdaptiveMaxPooling', 'nn.Module')
+
+function SpatialAdaptiveMaxPooling:__init(W, H)
+ parent.__init(self)
+
+ self.W = W
+ self.H = H
+end
+
+function SpatialAdaptiveMaxPooling:updateOutput(input)
+ self.indices = self.indices or torch.LongTensor()
+ if torch.typename(input):find('torch%.Cuda.*Tensor') then
+ self.indices = torch.CudaLongTensor and self.indices:cudaLong() or self.indices
+ else
+ self.indices = self.indices:long()
+ end
+ input.THNN.SpatialAdaptiveMaxPooling_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ self.indices:cdata(),
+ self.W, self.H
+ )
+ return self.output
+end
+
+function SpatialAdaptiveMaxPooling:updateGradInput(input, gradOutput)
+ input.THNN.SpatialAdaptiveMaxPooling_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.indices:cdata()
+ )
+ return self.gradInput
+end
+
+-- for backward compat
+function SpatialAdaptiveMaxPooling:empty()
+ self:clearState()
+end
+
+function SpatialAdaptiveMaxPooling:clearState()
+ if self.indices then
+ self.indices:set()
+ end
+ return parent.clearState(self)
+end
diff --git a/contrib/lua-torch/nn/SpatialAutoCropMSECriterion.lua b/contrib/lua-torch/nn/SpatialAutoCropMSECriterion.lua
new file mode 100644
index 000000000..97206a062
--- /dev/null
+++ b/contrib/lua-torch/nn/SpatialAutoCropMSECriterion.lua
@@ -0,0 +1,74 @@
+--[[
+ SpatialAutoCropMSECriterion.
+ Implements the MSECriterion for the case where the spatial resolution of the
+ input is less than or equal to the spatial resolution of the target. It
+ achieves this by center-cropping the target to the same spatial resolution
+ as the input; the MSE is then calculated between the input and the cropped
+ target.
+]]
+local SpatialAutoCropMSECriterion, parent = torch.class('nn.SpatialAutoCropMSECriterion', 'nn.MSECriterion')
+
+function SpatialAutoCropMSECriterion:__init(sizeAverage)
+ parent.__init(self, sizeAverage)
+end
+
+local function centerCrop(input, cropSize)
+ assert(input:dim() == 3 or input:dim() == 4, "input should be a 3D or 4D tensor")
+ assert(#cropSize == 2, "cropSize should have two elements only")
+ local _input = input
+ if input:dim() == 3 then
+ _input = input:view(1, input:size(1), input:size(2), input:size(3))
+ end
+ assert(cropSize[1] > 0 and cropSize[1] <= _input:size(3),
+ "0 < cropSize[1] <= input:size(3) not satisfied")
+ assert(cropSize[2] > 0 and cropSize[2] <= _input:size(4),
+ "0 < cropSize[1] <= input:size(3) not satisfied")
+
+ local inputHeight = _input:size(3)
+ local inputWidth = _input:size(4)
+
+ local rowStart = 1 + math.floor((inputHeight - cropSize[1])/2.0)
+ local rowEnd = rowStart + cropSize[1] - 1
+ local colStart = 1 + math.floor((inputWidth - cropSize[2])/2.0)
+ local colEnd = colStart + cropSize[2] - 1
+ if input:dim() == 3 then
+ return input[{{}, {rowStart, rowEnd}, {colStart, colEnd}}]
+ else
+ return input[{{}, {}, {rowStart, rowEnd}, {colStart, colEnd}}]
+ end
+end
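+
+-- Worked example of the indexing above: cropping an 8x8 map to 5x5 gives
+-- rowStart = 1 + floor((8-5)/2) = 2 and rowEnd = 6, i.e. rows 2..6 (centered).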
+
+local function getTensorHeightAndWidth(tensor)
+ local heightIdx = 2
+ local widthIdx = 3
+ if tensor:dim() == 4 then
+ heightIdx = 3
+ widthIdx = 4
+ end
+ return tensor:size(heightIdx), tensor:size(widthIdx)
+end
+
+local function inputResolutionIsAtMostTargetResolution(input, target)
+ local inputHeight, inputWidth = getTensorHeightAndWidth(input)
+ local targetHeight, targetWidth = getTensorHeightAndWidth(target)
+ return inputHeight <= targetHeight and inputWidth <= targetWidth
+end
+
+function SpatialAutoCropMSECriterion:updateOutput(input, target)
+ assert(input:dim() == target:dim(), "input and target should have the same number of dimensions")
+ assert(input:dim() == 4 or input:dim() == 3, "input and target must have 3 or 4 dimensions")
+ assert(inputResolutionIsAtMostTargetResolution(input, target),
+ "Spatial resolution of input should be less than or equal to the spatial resolution of the target")
+
+ local inputHeight, inputWidth = getTensorHeightAndWidth(input)
+ local targetCropped = centerCrop(target, {inputHeight, inputWidth})
+ return parent.updateOutput(self, input, targetCropped)
+end
+
+
+function SpatialAutoCropMSECriterion:updateGradInput(input, gradOutput)
+ assert(input:dim() == gradOutput:dim(), "input and gradOutput should have the same number of dimensions")
+ assert(input:dim() == 4 or input:dim() == 3, "input and gradOutput must have 3 or 4 dimensions")
+ assert(input:isSameSizeAs(gradOutput), "gradOutput and input must have the same size")
+
+ return parent.updateGradInput(self, input, gradOutput)
+end
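+
+-- Illustrative usage (sketch; input spatially smaller than the target):
+--   local crit = nn.SpatialAutoCropMSECriterion()
+--   local input  = torch.rand(4, 3, 6, 6)
+--   local target = torch.rand(4, 3, 8, 8)  -- center-cropped to 6x6 internally
+--   local loss = crit:forward(input, target)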
diff --git a/contrib/lua-torch/nn/SpatialAveragePooling.lua b/contrib/lua-torch/nn/SpatialAveragePooling.lua
new file mode 100644
index 000000000..1e7666827
--- /dev/null
+++ b/contrib/lua-torch/nn/SpatialAveragePooling.lua
@@ -0,0 +1,93 @@
+local SpatialAveragePooling, parent = torch.class('nn.SpatialAveragePooling', 'nn.Module')
+
+function SpatialAveragePooling:__init(kW, kH, dW, dH, padW, padH)
+ parent.__init(self)
+
+ self.kW = kW
+ self.kH = kH
+ self.dW = dW or 1
+ self.dH = dH or 1
+ self.padW = padW or 0
+ self.padH = padH or 0
+ self.ceil_mode = false
+ self.count_include_pad = true
+ self.divide = true
+end
+
+function SpatialAveragePooling:ceil()
+ self.ceil_mode = true
+ return self
+end
+
+function SpatialAveragePooling:floor()
+ self.ceil_mode = false
+ return self
+end
+
+function SpatialAveragePooling:setCountIncludePad()
+ self.count_include_pad = true
+ return self
+end
+
+function SpatialAveragePooling:setCountExcludePad()
+ self.count_include_pad = false
+ return self
+end
+
+local function backwardCompatible(self)
+ if self.ceil_mode == nil then
+ self.ceil_mode = false
+ self.count_include_pad = true
+ self.padH = 0
+ self.padW = 0
+ end
+end
+
+function SpatialAveragePooling:updateOutput(input)
+ backwardCompatible(self)
+ input.THNN.SpatialAveragePooling_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ self.kW, self.kH,
+ self.dW, self.dH,
+ self.padW, self.padH,
+ self.ceil_mode,
+ self.count_include_pad
+ )
+ -- for backward compatibility with saved models
+ -- which are not supposed to have "divide" field
+ if not self.divide then
+ self.output:mul(self.kW*self.kH)
+ end
+ return self.output
+end
+
+function SpatialAveragePooling:updateGradInput(input, gradOutput)
+ if self.gradInput then
+ input.THNN.SpatialAveragePooling_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.kW, self.kH,
+ self.dW, self.dH,
+ self.padW, self.padH,
+ self.ceil_mode,
+ self.count_include_pad
+ )
+ -- for backward compatibility
+ if not self.divide then
+ self.gradInput:mul(self.kW*self.kH)
+ end
+ return self.gradInput
+ end
+end
+
+function SpatialAveragePooling:__tostring__()
+ local s = string.format('%s(%dx%d, %d,%d', torch.type(self),
+ self.kW, self.kH, self.dW, self.dH)
+ if (self.padW or self.padH) and (self.padW ~= 0 or self.padH ~= 0) then
+ s = s .. ', ' .. self.padW .. ','.. self.padH
+ end
+ s = s .. ')'
+ return s
+end
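+
+-- Illustrative configuration (the setters above return self, so they chain):
+--   local pool = nn.SpatialAveragePooling(2, 2, 2, 2):ceil():setCountExcludePad()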
diff --git a/contrib/lua-torch/nn/SpatialBatchNormalization.lua b/contrib/lua-torch/nn/SpatialBatchNormalization.lua
new file mode 100644
index 000000000..c5004ce3a
--- /dev/null
+++ b/contrib/lua-torch/nn/SpatialBatchNormalization.lua
@@ -0,0 +1,35 @@
+--[[
+ This file implements Batch Normalization as described in the paper:
+ "Batch Normalization: Accelerating Deep Network Training
+ by Reducing Internal Covariate Shift"
+ by Sergey Ioffe, Christian Szegedy
+
+ This implementation is useful for inputs coming from convolution layers.
+ For non-convolutional layers, see BatchNormalization.lua
+
+ The operation implemented is:
+ y = ( x - mean(x) )
+ -------------------- * gamma + beta
+ standard-deviation(x)
+ where gamma and beta are learnable parameters.
+
+ The learning of gamma and beta is optional.
+
+ Usage:
+ with learnable parameters: nn.SpatialBatchNormalization(N [,eps] [,momentum])
+ where N = dimensionality of input
+ without learnable parameters: nn.SpatialBatchNormalization(N [,eps] [,momentum], false)
+
+ eps is a small value added to the variance to avoid divide-by-zero.
+ Defaults to 1e-5
+
+ During training, this layer keeps a running estimate of its computed mean and std.
+ The running sum is kept with a default momentum of 0.1 (unless overridden).
+ At test time, this running mean/std is used to normalize.
+]]--
+local BN, parent = torch.class('nn.SpatialBatchNormalization', 'nn.BatchNormalization')
+
+BN.__version = 2
+
+-- expected dimension of input
+BN.nDim = 4
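+
+-- Illustrative usage following the conventions documented above (sketch):
+--   local bn = nn.SpatialBatchNormalization(16)       -- 16 input feature planes
+--   local y = bn:forward(torch.randn(8, 16, 32, 32))  -- normalized per plane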
diff --git a/contrib/lua-torch/nn/SpatialClassNLLCriterion.lua b/contrib/lua-torch/nn/SpatialClassNLLCriterion.lua
new file mode 100644
index 000000000..fbd367410
--- /dev/null
+++ b/contrib/lua-torch/nn/SpatialClassNLLCriterion.lua
@@ -0,0 +1,81 @@
+local THNN = require 'nn.THNN'
+local SpatialClassNLLCriterion, parent = torch.class('nn.SpatialClassNLLCriterion', 'nn.Criterion')
+
+function SpatialClassNLLCriterion:__init(weights, sizeAverage)
+ parent.__init(self)
+ if sizeAverage ~= nil then
+ self.sizeAverage = sizeAverage
+ else
+ self.sizeAverage = true
+ end
+ if weights then
+ assert(weights:dim() == 1, "weights input should be 1-D Tensor")
+ self.weights = weights
+ end
+
+ self.output_tensor = torch.zeros(1)
+ self.total_weight_tensor = torch.ones(1)
+ self.target = torch.zeros(1):long()
+end
+
+function SpatialClassNLLCriterion:__len()
+ if (self.weights) then
+ return #self.weights
+ else
+ return 0
+ end
+end
+
+function SpatialClassNLLCriterion:updateOutput(input, target)
+ if type(target) == 'number' then
+ if torch.typename(input):find('torch%.Cuda.*Tensor') then
+ self.target = torch.CudaLongTensor and self.target:cudaLong() or self.target:cuda()
+ else
+ self.target = self.target:long()
+ end
+ self.target[1] = target
+ elseif torch.typename(input):find('torch%.Cuda.*Tensor') then
+ self.target = torch.CudaLongTensor and target:cudaLong() or target
+ else
+ self.target = target:long()
+ end
+
+ input.THNN.SpatialClassNLLCriterion_updateOutput(
+ input:cdata(),
+ self.target:cdata(),
+ self.output_tensor:cdata(),
+ self.sizeAverage,
+ THNN.optionalTensor(self.weights),
+ self.total_weight_tensor:cdata()
+ )
+ self.output = self.output_tensor[1]
+ return self.output, self.total_weight_tensor[1]
+end
+
+function SpatialClassNLLCriterion:updateGradInput(input, target)
+ if type(target) == 'number' then
+ if torch.typename(input):find('torch%.Cuda.*Tensor') then
+ self.target = torch.CudaLongTensor and self.target:cudaLong() or self.target:cuda()
+ else
+ self.target = self.target:long()
+ end
+ self.target[1] = target
+ elseif torch.typename(input):find('torch%.Cuda.*Tensor') then
+ self.target = torch.CudaLongTensor and target:cudaLong() or target
+ else
+ self.target = target:long()
+ end
+
+ self.gradInput:resizeAs(input):zero()
+
+ input.THNN.SpatialClassNLLCriterion_updateGradInput(
+ input:cdata(),
+ self.target:cdata(),
+ self.gradInput:cdata(),
+ self.sizeAverage,
+ THNN.optionalTensor(self.weights),
+ self.total_weight_tensor:cdata()
+ )
+
+ return self.gradInput
+end
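+
+-- Illustrative usage (sketch): per-pixel NLL over log-probabilities, with
+-- LongTensor class-index targets of matching spatial size:
+--   local crit = nn.SpatialClassNLLCriterion()
+--   local input = torch.randn(2, 5, 4, 4)             -- log-probabilities, 5 classes
+--   local target = torch.LongTensor(2, 4, 4):random(1, 5)
+--   local loss = crit:forward(input, target)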
diff --git a/contrib/lua-torch/nn/SpatialContrastiveNormalization.lua b/contrib/lua-torch/nn/SpatialContrastiveNormalization.lua
new file mode 100644
index 000000000..0ad251ae4
--- /dev/null
+++ b/contrib/lua-torch/nn/SpatialContrastiveNormalization.lua
@@ -0,0 +1,36 @@
+local SpatialContrastiveNormalization, parent = torch.class('nn.SpatialContrastiveNormalization','nn.Module')
+
+function SpatialContrastiveNormalization:__init(nInputPlane, kernel, threshold, thresval)
+ parent.__init(self)
+
+ -- get args
+ self.nInputPlane = nInputPlane or 1
+ self.kernel = kernel or torch.Tensor(9,9):fill(1)
+ self.threshold = threshold or 1e-4
+ self.thresval = thresval or threshold or 1e-4
+ local kdim = self.kernel:nDimension()
+
+ -- check args
+ if kdim ~= 2 and kdim ~= 1 then
+ error('<SpatialContrastiveNormalization> averaging kernel must be 2D or 1D')
+ end
+ if (self.kernel:size(1) % 2) == 0 or (kdim == 2 and (self.kernel:size(2) % 2) == 0) then
+ error('<SpatialContrastiveNormalization> averaging kernel must have ODD dimensions')
+ end
+
+ -- instantiate sub+div normalization
+ self.normalizer = nn.Sequential()
+ self.normalizer:add(nn.SpatialSubtractiveNormalization(self.nInputPlane, self.kernel))
+ self.normalizer:add(nn.SpatialDivisiveNormalization(self.nInputPlane, self.kernel,
+ self.threshold, self.thresval))
+end
+
+function SpatialContrastiveNormalization:updateOutput(input)
+ self.output = self.normalizer:forward(input)
+ return self.output
+end
+
+function SpatialContrastiveNormalization:updateGradInput(input, gradOutput)
+ self.gradInput = self.normalizer:backward(input, gradOutput)
+ return self.gradInput
+end
diff --git a/contrib/lua-torch/nn/SpatialConvolution.lua b/contrib/lua-torch/nn/SpatialConvolution.lua
new file mode 100644
index 000000000..15a2b4b62
--- /dev/null
+++ b/contrib/lua-torch/nn/SpatialConvolution.lua
@@ -0,0 +1,155 @@
+local THNN = require 'nn.THNN'
+local SpatialConvolution, parent = torch.class('nn.SpatialConvolution', 'nn.Module')
+
+function SpatialConvolution:__init(nInputPlane, nOutputPlane, kW, kH, dW, dH, padW, padH)
+ parent.__init(self)
+
+ dW = dW or 1
+ dH = dH or 1
+
+ self.nInputPlane = nInputPlane
+ self.nOutputPlane = nOutputPlane
+ self.kW = kW
+ self.kH = kH
+
+ self.dW = dW
+ self.dH = dH
+ self.padW = padW or 0
+ self.padH = padH or self.padW
+
+ self.weight = torch.Tensor(nOutputPlane, nInputPlane, kH, kW)
+ self.bias = torch.Tensor(nOutputPlane)
+ self.gradWeight = torch.Tensor(nOutputPlane, nInputPlane, kH, kW)
+ self.gradBias = torch.Tensor(nOutputPlane)
+
+ self:reset()
+end
+
+function SpatialConvolution:noBias()
+ self.bias = nil
+ self.gradBias = nil
+ return self
+end
+
+function SpatialConvolution:reset(stdv)
+ if stdv then
+ stdv = stdv * math.sqrt(3)
+ else
+ stdv = 1/math.sqrt(self.kW*self.kH*self.nInputPlane)
+ end
+ if nn.oldSeed then
+ self.weight:apply(function()
+ return torch.uniform(-stdv, stdv)
+ end)
+ if self.bias then
+ self.bias:apply(function()
+ return torch.uniform(-stdv, stdv)
+ end)
+ end
+ else
+ self.weight:uniform(-stdv, stdv)
+ if self.bias then
+ self.bias:uniform(-stdv, stdv)
+ end
+ end
+end
+
+local function backCompatibility(self)
+ self.finput = self.finput or self.weight.new()
+ self.fgradInput = self.fgradInput or self.weight.new()
+ if self.padding then
+ self.padW = self.padding
+ self.padH = self.padding
+ self.padding = nil
+ else
+ self.padW = self.padW or 0
+ self.padH = self.padH or 0
+ end
+ if self.weight:dim() == 2 then
+ self.weight = self.weight:view(self.nOutputPlane, self.nInputPlane, self.kH, self.kW)
+ end
+ if self.gradWeight and self.gradWeight:dim() == 2 then
+ self.gradWeight = self.gradWeight:view(self.nOutputPlane, self.nInputPlane, self.kH, self.kW)
+ end
+end
+
+function SpatialConvolution:updateOutput(input)
+ assert(input.THNN, torch.type(input)..'.THNN backend not imported')
+ backCompatibility(self)
+ input.THNN.SpatialConvolutionMM_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ self.weight:cdata(),
+ THNN.optionalTensor(self.bias),
+ self.finput:cdata(),
+ self.fgradInput:cdata(),
+ self.kW, self.kH,
+ self.dW, self.dH,
+ self.padW, self.padH
+ )
+ return self.output
+end
+
+function SpatialConvolution:updateGradInput(input, gradOutput)
+ assert(input.THNN, torch.type(input)..'.THNN backend not imported')
+ if self.gradInput then
+ backCompatibility(self)
+ input.THNN.SpatialConvolutionMM_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.weight:cdata(),
+ self.finput:cdata(),
+ self.fgradInput:cdata(),
+ self.kW, self.kH,
+ self.dW, self.dH,
+ self.padW, self.padH
+ )
+ return self.gradInput
+ end
+end
+
+function SpatialConvolution:accGradParameters(input, gradOutput, scale)
+ assert(input.THNN, torch.type(input)..'.THNN backend not imported')
+ scale = scale or 1
+ backCompatibility(self)
+ input.THNN.SpatialConvolutionMM_accGradParameters(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradWeight:cdata(),
+ THNN.optionalTensor(self.gradBias),
+ self.finput:cdata(),
+ self.fgradInput:cdata(),
+ self.kW, self.kH,
+ self.dW, self.dH,
+ self.padW, self.padH,
+ scale
+ )
+end
+
+function SpatialConvolution:type(type,tensorCache)
+ self.finput = self.finput and torch.Tensor()
+ self.fgradInput = self.fgradInput and torch.Tensor()
+ return parent.type(self,type,tensorCache)
+end
+
+function SpatialConvolution:__tostring__()
+ local s = string.format('%s(%d -> %d, %dx%d', torch.type(self),
+ self.nInputPlane, self.nOutputPlane, self.kW, self.kH)
+ if self.dW ~= 1 or self.dH ~= 1 or self.padW ~= 0 or self.padH ~= 0 then
+ s = s .. string.format(', %d,%d', self.dW, self.dH)
+ end
+ if (self.padW or self.padH) and (self.padW ~= 0 or self.padH ~= 0) then
+ s = s .. ', ' .. self.padW .. ',' .. self.padH
+ end
+ if self.bias then
+ return s .. ')'
+ else
+ return s .. ') without bias'
+ end
+end
+
+function SpatialConvolution:clearState()
+ nn.utils.clear(self, 'finput', 'fgradInput', '_input', '_gradOutput')
+ return parent.clearState(self)
+end
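+
+-- Illustrative usage (sketch): 3 -> 16 planes, 5x5 kernel, stride 1, pad 2
+-- keeps the spatial size unchanged:
+--   local conv = nn.SpatialConvolution(3, 16, 5, 5, 1, 1, 2, 2)
+--   local y = conv:forward(torch.rand(3, 32, 32))  -- y has size 16x32x32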
diff --git a/contrib/lua-torch/nn/SpatialConvolutionLocal.lua b/contrib/lua-torch/nn/SpatialConvolutionLocal.lua
new file mode 100644
index 000000000..9494c2ffe
--- /dev/null
+++ b/contrib/lua-torch/nn/SpatialConvolutionLocal.lua
@@ -0,0 +1,188 @@
+local SpatialConvolutionLocal, parent = torch.class('nn.SpatialConvolutionLocal', 'nn.Module')
+
+function SpatialConvolutionLocal:__init(nInputPlane, nOutputPlane, iW, iH, kW, kH, dW, dH, padW, padH)
+ parent.__init(self)
+
+ dW = dW or 1
+ dH = dH or 1
+
+ self.nInputPlane = nInputPlane
+ self.nOutputPlane = nOutputPlane
+ self.kW = kW
+ self.kH = kH
+ self.iW = iW
+ self.iH = iH
+
+ self.dW = dW
+ self.dH = dH
+ self.padW = padW or 0
+ self.padH = padH or self.padW
+ self.oW = math.floor((self.padW * 2 + iW - self.kW) / self.dW) + 1
+ self.oH = math.floor((self.padH * 2 + iH - self.kH) / self.dH) + 1
+ assert(1 <= self.oW and 1 <= self.oH, 'illegal configuration: output width or height less than 1')
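+ -- e.g. iW = 10, kW = 3, dW = 1, padW = 0  =>  oW = floor((0+10-3)/1)+1 = 8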
+
+ self.weight = torch.Tensor(self.oH, self.oW, nOutputPlane, nInputPlane, kH, kW)
+ self.bias = torch.Tensor(nOutputPlane, self.oH, self.oW)
+ self.gradWeight = torch.Tensor():resizeAs(self.weight)
+ self.gradBias = torch.Tensor():resizeAs(self.bias)
+
+ self:reset()
+end
+
+function SpatialConvolutionLocal:reset(stdv)
+ if stdv then
+ stdv = stdv * math.sqrt(3)
+ else
+ stdv = 1/math.sqrt(self.kW*self.kH*self.nInputPlane)
+ end
+ if nn.oldSeed then
+ self.weight:apply(function()
+ return torch.uniform(-stdv, stdv)
+ end)
+ self.bias:apply(function()
+ return torch.uniform(-stdv, stdv)
+ end)
+ else
+ self.weight:uniform(-stdv, stdv)
+ self.bias:uniform(-stdv, stdv)
+ end
+end
+
+local function viewWeight(self)
+ self.weight = self.weight:view(self.oH * self.oW, self.nOutputPlane, self.nInputPlane * self.kH * self.kW)
+ if self.gradWeight and self.gradWeight:dim() > 0 then
+ self.gradWeight = self.gradWeight:view(self.oH * self.oW, self.nOutputPlane, self.nInputPlane * self.kH * self.kW)
+ end
+end
+
+local function unviewWeight(self)
+ self.weight = self.weight:view(self.oH, self.oW, self.nOutputPlane, self.nInputPlane, self.kH, self.kW)
+ if self.gradWeight and self.gradWeight:dim() > 0 then
+ self.gradWeight = self.gradWeight:view(self.oH, self.oW, self.nOutputPlane, self.nInputPlane, self.kH, self.kW)
+ end
+end
+
+local function checkInputSize(self, input)
+ if input:nDimension() == 3 then
+ if input:size(1) ~= self.nInputPlane or input:size(2) ~= self.iH or input:size(3) ~= self.iW then
+ error(string.format('Given input size: (%dx%dx%d) inconsistent with expected input size: (%dx%dx%d).',
+ input:size(1), input:size(2), input:size(3), self.nInputPlane, self.iH, self.iW))
+ end
+ elseif input:nDimension() == 4 then
+ if input:size(2) ~= self.nInputPlane or input:size(3) ~= self.iH or input:size(4) ~= self.iW then
+ error(string.format('Given input size: (%dx%dx%dx%d) inconsistent with expected input size: (batchsize x%dx%dx%d).',
+ input:size(1), input:size(2), input:size(3), input:size(4), self.nInputPlane, self.iH, self.iW))
+ end
+ else
+ error('3D or 4D(batch mode) tensor expected')
+ end
+end
+
+local function checkOutputSize(self, input, output)
+ if output:nDimension() ~= input:nDimension() then
+ error('inconsistent dimension between output and input.')
+ end
+ if output:nDimension() == 3 then
+ if output:size(1) ~= self.nOutputPlane or output:size(2) ~= self.oH or output:size(3) ~= self.oW then
+ error(string.format('Given output size: (%dx%dx%d) inconsistent with expected output size: (%dx%dx%d).',
+ output:size(1), output:size(2), output:size(3), self.nOutputPlane, self.oH, self.oW))
+ end
+ elseif output:nDimension() == 4 then
+ if output:size(2) ~= self.nOutputPlane or output:size(3) ~= self.oH or output:size(4) ~= self.oW then
+ error(string.format('Given output size: (%dx%dx%dx%d) inconsistent with expected output size: (batchsize x%dx%dx%d).',
+ output:size(1), output:size(2), output:size(3), output:size(4), self.nOutputPlane, self.oH, self.oW))
+ end
+ else
+ error('3D or 4D(batch mode) tensor expected')
+ end
+end
+
+function SpatialConvolutionLocal:updateOutput(input)
+ self.finput = self.finput or input.new()
+ self.fgradInput = self.fgradInput or input.new()
+ checkInputSize(self, input)
+ viewWeight(self)
+ input.THNN.SpatialConvolutionLocal_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ self.weight:cdata(),
+ self.bias:cdata(),
+ self.finput:cdata(),
+ self.fgradInput:cdata(),
+ self.kW, self.kH,
+ self.dW, self.dH,
+ self.padW, self.padH,
+ self.iW, self.iH,
+ self.oW, self.oH
+ )
+ unviewWeight(self)
+ return self.output
+end
+
+function SpatialConvolutionLocal:updateGradInput(input, gradOutput)
+ checkInputSize(self, input)
+ checkOutputSize(self, input, gradOutput)
+ if self.gradInput then
+ viewWeight(self)
+ input.THNN.SpatialConvolutionLocal_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.weight:cdata(),
+ self.finput:cdata(),
+ self.fgradInput:cdata(),
+ self.kW, self.kH,
+ self.dW, self.dH,
+ self.padW, self.padH,
+ self.iW, self.iH,
+ self.oW, self.oH
+ )
+ unviewWeight(self)
+ return self.gradInput
+ end
+end
+
+function SpatialConvolutionLocal:accGradParameters(input, gradOutput, scale)
+ scale = scale or 1
+ checkInputSize(self, input)
+ checkOutputSize(self, input, gradOutput)
+ viewWeight(self)
+ input.THNN.SpatialConvolutionLocal_accGradParameters(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradWeight:cdata(),
+ self.gradBias:cdata(),
+ self.finput:cdata(),
+ self.fgradInput:cdata(),
+ self.kW, self.kH,
+ self.dW, self.dH,
+ self.padW, self.padH,
+ self.iW, self.iH,
+ self.oW, self.oH,
+ scale
+ )
+ unviewWeight(self)
+end
+
+function SpatialConvolutionLocal:type(type,tensorCache)
+ self.finput = self.finput and torch.Tensor()
+ self.fgradInput = self.fgradInput and torch.Tensor()
+ return parent.type(self,type,tensorCache)
+end
+
+function SpatialConvolutionLocal:__tostring__()
+ local s = string.format('%s(%d -> %d, %dx%d, %dx%d', torch.type(self),
+ self.nInputPlane, self.nOutputPlane, self.iW, self.iH, self.kW, self.kH)
+ if self.dW ~= 1 or self.dH ~= 1 or self.padW ~= 0 or self.padH ~= 0 then
+ s = s .. string.format(', %d,%d', self.dW, self.dH)
+ end
+ if (self.padW or self.padH) and (self.padW ~= 0 or self.padH ~= 0) then
+ s = s .. ', ' .. self.padW .. ',' .. self.padH
+ end
+ return s .. ')'
+end
+
+function SpatialConvolutionLocal:clearState()
+ nn.utils.clear(self, 'finput', 'fgradInput', '_input', '_gradOutput')
+ return parent.clearState(self)
+end
diff --git a/contrib/lua-torch/nn/SpatialConvolutionMM.lua b/contrib/lua-torch/nn/SpatialConvolutionMM.lua
new file mode 100644
index 000000000..f20734f9b
--- /dev/null
+++ b/contrib/lua-torch/nn/SpatialConvolutionMM.lua
@@ -0,0 +1,139 @@
+local THNN = require 'nn.THNN'
+local SpatialConvolutionMM, parent = torch.class('nn.SpatialConvolutionMM', 'nn.Module')
+
+function SpatialConvolutionMM:__init(nInputPlane, nOutputPlane, kW, kH, dW, dH, padW, padH)
+ parent.__init(self)
+
+ dW = dW or 1
+ dH = dH or 1
+
+ self.nInputPlane = nInputPlane
+ self.nOutputPlane = nOutputPlane
+ self.kW = kW
+ self.kH = kH
+
+ self.dW = dW
+ self.dH = dH
+ self.padW = padW or 0
+ self.padH = padH or self.padW
+
+ self.weight = torch.Tensor(nOutputPlane, nInputPlane*kH*kW)
+ self.bias = torch.Tensor(nOutputPlane)
+ self.gradWeight = torch.Tensor(nOutputPlane, nInputPlane*kH*kW)
+ self.gradBias = torch.Tensor(nOutputPlane)
+
+ self:reset()
+end
+
+function SpatialConvolutionMM:noBias()
+ self.bias = nil
+ self.gradBias = nil
+ return self
+end
+
+function SpatialConvolutionMM:reset(stdv)
+ if stdv then
+ stdv = stdv * math.sqrt(3)
+ else
+ stdv = 1/math.sqrt(self.kW*self.kH*self.nInputPlane)
+ end
+ if nn.oldSeed then
+ self.weight:apply(function()
+ return torch.uniform(-stdv, stdv)
+ end)
+ self.bias:apply(function()
+ return torch.uniform(-stdv, stdv)
+ end)
+ else
+ self.weight:uniform(-stdv, stdv)
+ self.bias:uniform(-stdv, stdv)
+ end
+end
+
+function SpatialConvolutionMM:updateOutput(input)
+ assert(input.THNN, torch.type(input)..'.THNN backend not imported')
+ self.finput = self.finput or input.new()
+ self.fgradInput = self.fgradInput or input.new()
+ -- backward compatibility
+ if self.padding then
+ self.padW = self.padding
+ self.padH = self.padding
+ self.padding = nil
+ end
+ input.THNN.SpatialConvolutionMM_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ self.weight:cdata(),
+ THNN.optionalTensor(self.bias),
+ self.finput:cdata(),
+ self.fgradInput:cdata(),
+ self.kW, self.kH,
+ self.dW, self.dH,
+ self.padW, self.padH
+ )
+ return self.output
+end
+
+function SpatialConvolutionMM:updateGradInput(input, gradOutput)
+ assert(input.THNN, torch.type(input)..'.THNN backend not imported')
+ if self.gradInput then
+ input.THNN.SpatialConvolutionMM_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.weight:cdata(),
+ self.finput:cdata(),
+ self.fgradInput:cdata(),
+ self.kW, self.kH,
+ self.dW, self.dH,
+ self.padW, self.padH
+ )
+ return self.gradInput
+ end
+end
+
+function SpatialConvolutionMM:accGradParameters(input, gradOutput, scale)
+ assert(input.THNN, torch.type(input)..'.THNN backend not imported')
+ scale = scale or 1
+ assert((self.bias and self.gradBias) or (self.bias == nil and self.gradBias == nil))
+ input.THNN.SpatialConvolutionMM_accGradParameters(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradWeight:cdata(),
+ THNN.optionalTensor(self.gradBias),
+ self.finput:cdata(),
+ self.fgradInput:cdata(),
+ self.kW, self.kH,
+ self.dW, self.dH,
+ self.padW, self.padH,
+ scale
+ )
+end
+
+function SpatialConvolutionMM:type(type,tensorCache)
+ self.finput = self.finput and torch.Tensor()
+ self.fgradInput = self.fgradInput and torch.Tensor()
+ return parent.type(self,type,tensorCache)
+end
+
+function SpatialConvolutionMM:__tostring__()
+ local s = string.format('%s(%d -> %d, %dx%d', torch.type(self),
+ self.nInputPlane, self.nOutputPlane, self.kW, self.kH)
+ if self.dW ~= 1 or self.dH ~= 1 or self.padW ~= 0 or self.padH ~= 0 then
+ s = s .. string.format(', %d,%d', self.dW, self.dH)
+ end
+ if (self.padW or self.padH) and (self.padW ~= 0 or self.padH ~= 0) then
+ s = s .. ', ' .. self.padW .. ',' .. self.padH
+ end
+ if self.bias then
+ return s .. ')'
+ else
+ return s .. ') without bias'
+ end
+end
+
+function SpatialConvolutionMM:clearState()
+ nn.utils.clear(self, 'finput', 'fgradInput', '_input', '_gradOutput')
+ return parent.clearState(self)
+end
+
diff --git a/contrib/lua-torch/nn/SpatialConvolutionMap.lua b/contrib/lua-torch/nn/SpatialConvolutionMap.lua
new file mode 100644
index 000000000..9051c119e
--- /dev/null
+++ b/contrib/lua-torch/nn/SpatialConvolutionMap.lua
@@ -0,0 +1,154 @@
+local SpatialConvolutionMap, parent = torch.class('nn.SpatialConvolutionMap', 'nn.Module')
+
+nn.tables = nn.tables or {}
+
+function nn.tables.full(nin, nout)
+ local ft = torch.Tensor(nin*nout,2)
+ local p = 1
+ for j=1,nout do
+ for i=1,nin do
+ ft[p][1] = i
+ ft[p][2] = j
+ p = p + 1
+ end
+ end
+ return ft
+end
+
+function nn.tables.oneToOne(nfeat)
+ local ft = torch.Tensor(nfeat,2)
+ for i=1,nfeat do
+ ft[i][1] = i
+ ft[i][2] = i
+ end
+ return ft
+end
+
+function nn.tables.random(nin, nout, nto)
+ local nker = nto * nout
+ local tbl = torch.Tensor(nker, 2)
+ local fi = torch.randperm(nin)
+ local frcntr = 1
+ local nfi = math.floor(nin/nto) -- number of distinct nto chunks
+ local totbl = tbl:select(2,2)
+ local frtbl = tbl:select(2,1)
+ local fitbl = fi:narrow(1, 1, (nfi * nto)) -- part of fi that covers distinct chunks
+ local ufrtbl= frtbl:unfold(1, nto, nto)
+ local utotbl= totbl:unfold(1, nto, nto)
+ local ufitbl= fitbl:unfold(1, nto, nto)
+
+ -- start filling frtbl
+ for i=1,nout do -- for each unit in the target map
+ ufrtbl:select(1,i):copy(ufitbl:select(1,frcntr))
+ frcntr = frcntr + 1
+ if frcntr-1 == nfi then -- reset fi
+ fi:copy(torch.randperm(nin))
+ frcntr = 1
+ end
+ end
+ for tocntr=1,utotbl:size(1) do
+ utotbl:select(1,tocntr):fill(tocntr)
+ end
+ return tbl
+end
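+
+-- Illustrative use of these connection tables with the module below (sketch):
+--   nn.SpatialConvolutionMap(nn.tables.full(3, 16), 5, 5)       -- dense 3 -> 16
+--   nn.SpatialConvolutionMap(nn.tables.random(16, 32, 4), 3, 3) -- 4 random inputs per output map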
+
+function SpatialConvolutionMap:__init(conMatrix, kW, kH, dW, dH)
+ parent.__init(self)
+
+ dW = dW or 1
+ dH = dH or 1
+
+ self.kW = kW
+ self.kH = kH
+ self.dW = dW
+ self.dH = dH
+ self.connTable = conMatrix
+ self.nInputPlane = self.connTable:select(2,1):max()
+ self.nOutputPlane = self.connTable:select(2,2):max()
+ self.weight = torch.Tensor(self.connTable:size(1), kH, kW)
+ self.bias = torch.Tensor(self.nOutputPlane)
+ self.gradWeight = torch.Tensor(self.connTable:size(1), kH, kW)
+ self.gradBias = torch.Tensor(self.nOutputPlane)
+
+ self:reset()
+end
+
+function SpatialConvolutionMap:reset(stdv)
+ if stdv then
+ stdv = stdv * math.sqrt(3)
+ if nn.oldSeed then
+ self.weight:apply(function()
+ return torch.uniform(-stdv, stdv)
+ end)
+ self.bias:apply(function()
+ return torch.uniform(-stdv, stdv)
+ end)
+ else
+ self.weight:uniform(-stdv, stdv)
+ self.bias:uniform(-stdv, stdv)
+ end
+ else
+ local ninp = torch.Tensor(self.nOutputPlane):zero()
+ for i=1,self.connTable:size(1) do ninp[self.connTable[i][2]] = ninp[self.connTable[i][2]]+1 end
+ for k=1,self.connTable:size(1) do
+ stdv = 1/math.sqrt(self.kW*self.kH*ninp[self.connTable[k][2]])
+ if nn.oldSeed then
+ self.weight:select(1,k):apply(function() return torch.uniform(-stdv,stdv) end)
+ else
+ self.weight:select(1,k):uniform(-stdv,stdv)
+ end
+ end
+ for k=1,self.bias:size(1) do
+ stdv = 1/math.sqrt(self.kW*self.kH*ninp[k])
+ self.bias[k] = torch.uniform(-stdv,stdv)
+ end
+ end
+end
+
+function SpatialConvolutionMap:updateOutput(input)
+ input.THNN.SpatialConvolutionMap_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ self.weight:cdata(),
+ self.bias:cdata(),
+ self.connTable:cdata(),
+ self.nInputPlane,
+ self.nOutputPlane,
+ self.dW, self.dH
+ )
+ return self.output
+end
+
+function SpatialConvolutionMap:updateGradInput(input, gradOutput)
+ input.THNN.SpatialConvolutionMap_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.weight:cdata(),
+ self.bias:cdata(),
+ self.connTable:cdata(),
+ self.nInputPlane,
+ self.nOutputPlane,
+ self.dW, self.dH
+ )
+ return self.gradInput
+end
+
+function SpatialConvolutionMap:accGradParameters(input, gradOutput, scale)
+ input.THNN.SpatialConvolutionMap_accGradParameters(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradWeight:cdata(),
+ self.gradBias:cdata(),
+ self.connTable:cdata(),
+ self.nInputPlane,
+ self.nOutputPlane,
+ self.dW, self.dH,
+ scale or 1
+ )
+end
+
+function SpatialConvolutionMap:decayParameters(decay)
+ self.weight:add(-decay, self.weight)
+ self.bias:add(-decay, self.bias)
+end
diff --git a/contrib/lua-torch/nn/SpatialCrossMapLRN.lua b/contrib/lua-torch/nn/SpatialCrossMapLRN.lua
new file mode 100644
index 000000000..088eb07f0
--- /dev/null
+++ b/contrib/lua-torch/nn/SpatialCrossMapLRN.lua
@@ -0,0 +1,153 @@
+local SpatialCrossMapLRN, parent = torch.class('nn.SpatialCrossMapLRN', 'nn.Module')
+
+function SpatialCrossMapLRN:__init(size, alpha, beta, k)
+ parent.__init(self)
+
+ self.size = size
+ self.alpha = alpha or 0.0001
+ self.beta = beta or 0.75
+ self.k = k or 1
+end
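+
+-- The normalization implemented below (a summary of the CPU branch; the sum
+-- runs over a size-wide channel window centered on c):
+--   y[c] = x[c] * (k + alpha/size * sum_{c' in window(c)} x[c']^2)^(-beta)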
+
+function SpatialCrossMapLRN:updateOutput(input)
+ assert(input:dim() == 3 or input:dim() == 4,
+ 'Input must be 3D or 4D')
+
+ self.scale = self.scale or input.new()
+
+ if torch.typename(input):find('torch%.Cuda.*Tensor') then
+ input.THNN.SpatialCrossMapLRN_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ self.scale:cdata(),
+ self.size,
+ self.alpha,
+ self.beta,
+ self.k
+ )
+ else
+ local isBatch = true
+ if input:dim() == 3 then
+ input = nn.utils.addSingletonDimension(input)
+ isBatch = false
+ end
+
+ local batchSize = input:size(1)
+ local channels = input:size(2)
+ local inputHeight = input:size(3)
+ local inputWidth = input:size(4)
+
+ self.output:resizeAs(input)
+ self.scale:resizeAs(input)
+
+ -- use output storage as temporary buffer
+ local inputSquare = self.output
+ inputSquare:pow(input, 2)
+
+ local prePad = (self.size - 1)/2 + 1
+ local prePadCrop = prePad > channels and channels or prePad
+
+ local scaleFirst = self.scale:select(2,1)
+ scaleFirst:zero()
+ -- compute first feature map normalization
+ for c = 1, prePadCrop do
+ scaleFirst:add(inputSquare:select(2, c))
+ end
+
+ -- reuse computations for next feature maps normalization
+ -- by adding the next feature map and removing the previous
+ for c = 2, channels do
+ local scalePrevious = self.scale:select(2, c -1)
+ local scaleCurrent = self.scale:select(2, c)
+ scaleCurrent:copy(scalePrevious)
+ if c < channels - prePad + 2 then
+ local squareNext = inputSquare:select(2, c + prePad - 1)
+ scaleCurrent:add(1, squareNext)
+ end
+ if c > prePad then
+ local squarePrevious = inputSquare:select(2, c - prePad )
+ scaleCurrent:add(-1, squarePrevious)
+ end
+ end
+
+ self.scale:mul(self.alpha/self.size):add(self.k)
+
+ self.output:pow(self.scale,-self.beta)
+ self.output:cmul(input)
+
+ if not isBatch then
+ self.output = self.output[1]
+ end
+ end
+
+ return self.output
+end
+
+function SpatialCrossMapLRN:updateGradInput(input, gradOutput)
+ assert(input:dim() == 3 or input:dim() == 4,
+ 'Input must be 3D or 4D')
+
+ if torch.typename(input):find('torch%.Cuda.*Tensor') then
+ input.THNN.SpatialCrossMapLRN_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.scale:cdata(),
+ self.output:cdata(),
+ self.size,
+ self.alpha,
+ self.beta,
+ self.k
+ )
+ else
+ local isBatch = true
+ if input:dim() == 3 then
+ input = nn.utils.addSingletonDimension(input)
+ gradOutput = nn.utils.addSingletonDimension(gradOutput)
+ self.output = nn.utils.addSingletonDimension(self.output)
+ isBatch = false
+ end
+
+ local batchSize = input:size(1)
+ local channels = input:size(2)
+ local inputHeight = input:size(3)
+ local inputWidth = input:size(4)
+
+ self.paddedRatio = self.paddedRatio or input.new()
+ self.accumRatio = self.accumRatio or input.new()
+ self.paddedRatio:resize(channels + self.size - 1, inputHeight, inputWidth)
+ self.accumRatio:resize(inputHeight,inputWidth)
+
+ local cacheRatioValue = 2*self.alpha*self.beta/self.size
+ local inversePrePad = self.size - (self.size - 1) / 2
+
+ self.gradInput:resizeAs(input)
+ self.gradInput:pow(self.scale,-self.beta):cmul(gradOutput)
+
+ self.paddedRatio:zero()
+ local paddedRatioCenter = self.paddedRatio:narrow(1, inversePrePad, channels)
+ for n = 1, batchSize do
+ paddedRatioCenter:cmul(gradOutput[n],self.output[n])
+ paddedRatioCenter:cdiv(self.scale[n])
+ self.accumRatio:sum(self.paddedRatio:narrow(1,1,self.size-1), 1)
+ for c = 1, channels do
+ self.accumRatio:add(self.paddedRatio[c+self.size-1])
+ self.gradInput[n][c]:addcmul(-cacheRatioValue, input[n][c], self.accumRatio)
+ self.accumRatio:add(-1, self.paddedRatio[c])
+ end
+ end
+
+ if not isBatch then
+ self.gradInput = self.gradInput[1]
+ self.output = self.output[1]
+ end
+ end
+
+ return self.gradInput
+end
+
+
+function SpatialCrossMapLRN:clearState()
+ nn.utils.clear(self, 'scale', 'paddedRatio', 'accumRatio')
+ return parent.clearState(self)
+end
diff --git a/contrib/lua-torch/nn/SpatialDepthWiseConvolution.lua b/contrib/lua-torch/nn/SpatialDepthWiseConvolution.lua
new file mode 100644
index 000000000..1132f04cb
--- /dev/null
+++ b/contrib/lua-torch/nn/SpatialDepthWiseConvolution.lua
@@ -0,0 +1,139 @@
+local THNN = require 'nn.THNN'
+local SpatialDepthWiseConvolution, parent = torch.class('nn.SpatialDepthWiseConvolution', 'nn.Module')
+
+function SpatialDepthWiseConvolution:__init(nInputPlane, nOutputPlane, kW, kH, dW, dH, padW, padH)
+ parent.__init(self)
+
+ dW = dW or 1
+ dH = dH or 1
+
+ self.nInputPlane = nInputPlane
+ self.nOutputPlane = nOutputPlane
+ self.kW = kW
+ self.kH = kH
+
+ self.dW = dW
+ self.dH = dH
+ self.padW = padW or 0
+ self.padH = padH or self.padW
+
+ self.weight = torch.Tensor(nOutputPlane, nInputPlane*kH*kW)
+ self.bias = torch.Tensor(nOutputPlane, nInputPlane)
+ self.gradWeight = torch.Tensor(nOutputPlane, nInputPlane*kH*kW)
+ self.gradBias = torch.Tensor(nOutputPlane, nInputPlane)
+
+ self:reset()
+end
+
+function SpatialDepthWiseConvolution:noBias()
+ self.bias = nil
+ self.gradBias = nil
+ return self
+end
+
+function SpatialDepthWiseConvolution:reset(stdv)
+ if stdv then
+ stdv = stdv * math.sqrt(3)
+ else
+ stdv = 1/math.sqrt(self.kW*self.kH*self.nInputPlane)
+ end
+ if nn.oldSeed then
+ self.weight:apply(function()
+ return torch.uniform(-stdv, stdv)
+ end)
+ self.bias:apply(function()
+ return torch.uniform(-stdv, stdv)
+ end)
+ else
+ self.weight:uniform(-stdv, stdv)
+ self.bias:uniform(-stdv, stdv)
+ end
+end
+
+function SpatialDepthWiseConvolution:updateOutput(input)
+ assert(input.THNN, torch.type(input)..'.THNN backend not imported')
+ self.finput = self.finput or input.new()
+ self.fgradInput = self.fgradInput or input.new()
+ -- backward compatibility
+ if self.padding then
+ self.padW = self.padding
+ self.padH = self.padding
+ self.padding = nil
+ end
+ input.THNN.SpatialDepthWiseConvolution_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ self.weight:cdata(),
+ THNN.optionalTensor(self.bias),
+ self.finput:cdata(),
+ self.fgradInput:cdata(),
+ self.kW, self.kH,
+ self.dW, self.dH,
+ self.padW, self.padH
+ )
+ return self.output
+end
+
+function SpatialDepthWiseConvolution:updateGradInput(input, gradOutput)
+ assert(input.THNN, torch.type(input)..'.THNN backend not imported')
+ if self.gradInput then
+ input.THNN.SpatialDepthWiseConvolution_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.weight:cdata(),
+ self.finput:cdata(),
+ self.fgradInput:cdata(),
+ self.kW, self.kH,
+ self.dW, self.dH,
+ self.padW, self.padH
+ )
+ return self.gradInput
+ end
+end
+
+function SpatialDepthWiseConvolution:accGradParameters(input, gradOutput, scale)
+ assert(input.THNN, torch.type(input)..'.THNN backend not imported')
+ scale = scale or 1
+ assert((self.bias and self.gradBias) or (self.bias == nil and self.gradBias == nil))
+ input.THNN.SpatialDepthWiseConvolution_accGradParameters(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradWeight:cdata(),
+ THNN.optionalTensor(self.gradBias),
+ self.finput:cdata(),
+ self.fgradInput:cdata(),
+ self.kW, self.kH,
+ self.dW, self.dH,
+ self.padW, self.padH,
+ scale
+ )
+end
+
+function SpatialDepthWiseConvolution:type(type,tensorCache)
+ self.finput = self.finput and torch.Tensor()
+ self.fgradInput = self.fgradInput and torch.Tensor()
+ return parent.type(self,type,tensorCache)
+end
+
+function SpatialDepthWiseConvolution:__tostring__()
+ local s = string.format('%s(%d -> %d, %dx%d', torch.type(self),
+ self.nInputPlane, self.nOutputPlane, self.kW, self.kH)
+ if self.dW ~= 1 or self.dH ~= 1 or self.padW ~= 0 or self.padH ~= 0 then
+ s = s .. string.format(', %d,%d', self.dW, self.dH)
+ end
+ if (self.padW or self.padH) and (self.padW ~= 0 or self.padH ~= 0) then
+ s = s .. ', ' .. self.padW .. ',' .. self.padH
+ end
+ if self.bias then
+ return s .. ')'
+ else
+ return s .. ') without bias'
+ end
+end
+
+function SpatialDepthWiseConvolution:clearState()
+ nn.utils.clear(self, 'finput', 'fgradInput', '_input', '_gradOutput')
+ return parent.clearState(self)
+end
+
diff --git a/contrib/lua-torch/nn/SpatialDilatedConvolution.lua b/contrib/lua-torch/nn/SpatialDilatedConvolution.lua
new file mode 100644
index 000000000..a0590c7e9
--- /dev/null
+++ b/contrib/lua-torch/nn/SpatialDilatedConvolution.lua
@@ -0,0 +1,80 @@
+local THNN = require 'nn.THNN'
+local SpatialDilatedConvolution, parent = torch.class('nn.SpatialDilatedConvolution', 'nn.SpatialConvolution')
+
+function SpatialDilatedConvolution:__init(nInputPlane, nOutputPlane, kW, kH, dW, dH, padW, padH, dilationW, dilationH)
+ parent.__init(self, nInputPlane, nOutputPlane, kW, kH, dW, dH, padW, padH)
+
+ self.dilationW = dilationW or 1
+ self.dilationH = dilationH or 1
+end
+
+function SpatialDilatedConvolution:updateOutput(input)
+ self.finput = self.finput or self.weight.new()
+ self.fgradInput = self.fgradInput or self.weight.new()
+ input.THNN.SpatialDilatedConvolution_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ self.weight:cdata(),
+ THNN.optionalTensor(self.bias),
+ self.finput:cdata(),
+ self.fgradInput:cdata(),
+ self.kW, self.kH,
+ self.dW, self.dH,
+ self.padW, self.padH,
+ self.dilationW, self.dilationH
+ )
+ return self.output
+end
+
+function SpatialDilatedConvolution:updateGradInput(input, gradOutput)
+ if self.gradInput then
+ self.fgradInput = self.fgradInput or self.weight.new()
+ input.THNN.SpatialDilatedConvolution_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.weight:cdata(),
+ self.finput:cdata(),
+ self.kW, self.kH,
+ self.dW, self.dH,
+ self.padW, self.padH,
+ self.dilationW, self.dilationH
+ )
+ return self.gradInput
+ end
+end
+
+function SpatialDilatedConvolution:accGradParameters(input, gradOutput, scale)
+ scale = scale or 1
+ self.fgradInput = self.fgradInput or self.weight.new()
+ input.THNN.SpatialDilatedConvolution_accGradParameters(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradWeight:cdata(),
+ THNN.optionalTensor(self.gradBias),
+ self.finput:cdata(),
+ self.fgradInput:cdata(),
+ self.kW, self.kH,
+ self.dW, self.dH,
+ self.padW, self.padH,
+ self.dilationW, self.dilationH,
+ scale
+ )
+end
+
+function SpatialDilatedConvolution:__tostring__()
+ local s = string.format('%s(%d -> %d, %dx%d', torch.type(self),
+ self.nInputPlane, self.nOutputPlane, self.kW, self.kH)
+ if self.dW ~= 1 or self.dH ~= 1 or self.padW ~= 0 or self.padH ~= 0 then
+ s = s .. string.format(', %d,%d', self.dW, self.dH)
+ end
+ if (self.padW or self.padH) and (self.padW ~= 0 or self.padH ~= 0) then
+ s = s .. ', ' .. self.padW .. ',' .. self.padH
+ end
+ s = s .. ', ' .. self.dilationW .. ',' .. self.dilationH
+ if self.bias then
+ return s .. ')'
+ else
+ return s .. ') without bias'
+ end
+end
diff --git a/contrib/lua-torch/nn/SpatialDilatedMaxPooling.lua b/contrib/lua-torch/nn/SpatialDilatedMaxPooling.lua
new file mode 100644
index 000000000..34525a4ad
--- /dev/null
+++ b/contrib/lua-torch/nn/SpatialDilatedMaxPooling.lua
@@ -0,0 +1,67 @@
+local THNN = require 'nn.THNN'
+local SpatialDilatedMaxPooling, parent = torch.class('nn.SpatialDilatedMaxPooling', 'nn.SpatialMaxPooling')
+
+function SpatialDilatedMaxPooling:__init(kW, kH, dW, dH, padW, padH, dilationW, dilationH)
+ parent.__init(self, kW, kH, dW, dH, padW, padH)
+
+ self.dilationW = dilationW or 1
+ self.dilationH = dilationH or 1
+end
+
+function SpatialDilatedMaxPooling:updateOutput(input)
+ self.indices = self.indices or torch.LongTensor()
+ if torch.typename(input):find('torch%.Cuda.*Tensor') then
+ self.indices = torch.CudaLongTensor and self.indices:cudaLong() or self.indices
+ else
+ self.indices = self.indices:long()
+ end
+
+ local dims = input:dim()
+ self.iheight = input:size(dims-1)
+ self.iwidth = input:size(dims)
+
+ input.THNN.SpatialDilatedMaxPooling_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ self.indices:cdata(),
+ self.kW, self.kH,
+ self.dW, self.dH,
+ self.padW, self.padH,
+ self.dilationW, self.dilationH,
+ self.ceil_mode
+ )
+ return self.output
+end
+
+function SpatialDilatedMaxPooling:updateGradInput(input, gradOutput)
+ input.THNN.SpatialDilatedMaxPooling_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.indices:cdata(),
+ self.kW, self.kH,
+ self.dW, self.dH,
+ self.padW, self.padH,
+ self.dilationW, self.dilationH,
+ self.ceil_mode
+ )
+ return self.gradInput
+end
+
+function SpatialDilatedMaxPooling:__tostring__()
+ local s = string.format('%s(%dx%d, %d,%d', torch.type(self),
+ self.kW, self.kH, self.dW, self.dH)
+ if (self.padW or self.padH) and (self.padW ~= 0 or self.padH ~= 0) then
+ s = s .. ', ' .. self.padW .. ','.. self.padH
+ end
+ s = s .. ', ' .. self.dilationW .. ',' .. self.dilationH
+ s = s .. ')'
+ return s
+end
+
+function SpatialDilatedMaxPooling:clearState()
+ if self.indices then
+ self.indices:set()
+ end
+ return parent.clearState(self)
+end
diff --git a/contrib/lua-torch/nn/SpatialDivisiveNormalization.lua b/contrib/lua-torch/nn/SpatialDivisiveNormalization.lua
new file mode 100644
index 000000000..dc2b8c530
--- /dev/null
+++ b/contrib/lua-torch/nn/SpatialDivisiveNormalization.lua
@@ -0,0 +1,136 @@
+local SpatialDivisiveNormalization, parent = torch.class('nn.SpatialDivisiveNormalization','nn.Module')
+
+function SpatialDivisiveNormalization:__init(nInputPlane, kernel, threshold, thresval)
+ parent.__init(self)
+
+ -- get args
+ self.nInputPlane = nInputPlane or 1
+ self.kernel = kernel or torch.Tensor(9,9):fill(1)
+ self.threshold = threshold or 1e-4
+ self.thresval = thresval or threshold or 1e-4
+ local kdim = self.kernel:nDimension()
+
+ -- check args
+ if kdim ~= 2 and kdim ~= 1 then
+ error('<SpatialDivisiveNormalization> averaging kernel must be 2D or 1D')
+ end
+ if (self.kernel:size(1) % 2) == 0 or (kdim == 2 and (self.kernel:size(2) % 2) == 0) then
+ error('<SpatialDivisiveNormalization> averaging kernel must have ODD dimensions')
+ end
+
+ -- padding values
+ local padH = math.floor(self.kernel:size(1)/2)
+ local padW = padH
+ if kdim == 2 then
+ padW = math.floor(self.kernel:size(2)/2)
+ end
+
+ -- create convolutional mean estimator
+ self.meanestimator = nn.Sequential()
+ self.meanestimator:add(nn.SpatialZeroPadding(padW, padW, padH, padH))
+ if kdim == 2 then
+ self.meanestimator:add(nn.SpatialConvolution(self.nInputPlane, 1, self.kernel:size(2), self.kernel:size(1)))
+ else
+ self.meanestimator:add(nn.SpatialConvolutionMap(nn.tables.oneToOne(self.nInputPlane), self.kernel:size(1), 1))
+ self.meanestimator:add(nn.SpatialConvolution(self.nInputPlane, 1, 1, self.kernel:size(1)))
+ end
+ self.meanestimator:add(nn.Replicate(self.nInputPlane,1,3))
+
+ -- create convolutional std estimator
+ self.stdestimator = nn.Sequential()
+ self.stdestimator:add(nn.Square())
+ self.stdestimator:add(nn.SpatialZeroPadding(padW, padW, padH, padH))
+ if kdim == 2 then
+ self.stdestimator:add(nn.SpatialConvolution(self.nInputPlane, 1, self.kernel:size(2), self.kernel:size(1)))
+ else
+ self.stdestimator:add(nn.SpatialConvolutionMap(nn.tables.oneToOne(self.nInputPlane), self.kernel:size(1), 1))
+ self.stdestimator:add(nn.SpatialConvolution(self.nInputPlane, 1, 1, self.kernel:size(1)))
+ end
+ self.stdestimator:add(nn.Replicate(self.nInputPlane,1,3))
+ self.stdestimator:add(nn.Sqrt())
+
+ -- set kernel and bias
+ if kdim == 2 then
+ self.kernel:div(self.kernel:sum() * self.nInputPlane)
+ for i = 1,self.nInputPlane do
+ self.meanestimator.modules[2].weight[1][i] = self.kernel
+ self.stdestimator.modules[3].weight[1][i] = self.kernel
+ end
+ self.meanestimator.modules[2].bias:zero()
+ self.stdestimator.modules[3].bias:zero()
+ else
+ self.kernel:div(self.kernel:sum() * math.sqrt(self.nInputPlane))
+ for i = 1,self.nInputPlane do
+ self.meanestimator.modules[2].weight[i]:copy(self.kernel)
+ self.meanestimator.modules[3].weight[1][i]:copy(self.kernel)
+ self.stdestimator.modules[3].weight[i]:copy(self.kernel)
+ self.stdestimator.modules[4].weight[1][i]:copy(self.kernel)
+ end
+ self.meanestimator.modules[2].bias:zero()
+ self.meanestimator.modules[3].bias:zero()
+ self.stdestimator.modules[3].bias:zero()
+ self.stdestimator.modules[4].bias:zero()
+ end
+
+ -- other operation
+ self.normalizer = nn.CDivTable()
+ self.divider = nn.CDivTable()
+ self.thresholder = nn.Threshold(self.threshold, self.thresval)
+
+ -- coefficient array, to adjust side effects
+ self.coef = torch.Tensor(1,1,1)
+end
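+
+-- Effective computation (a summary of updateOutput below): the local std is
+-- divided by a border-correction coefficient, floored via the thresholder,
+-- and the input is divided by the result:
+--   y = x / Threshold(threshold, thresval)( localstd(x) / coef )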
+
+function SpatialDivisiveNormalization:updateOutput(input)
+
+ self.localstds = self.stdestimator:updateOutput(input)
+
+ -- compute side coefficients
+ local dim = input:dim()
+ if self.localstds:dim() ~= self.coef:dim() or (input:size(dim) ~= self.coef:size(dim)) or (input:size(dim-1) ~= self.coef:size(dim-1)) then
+ self.ones = self.ones or input.new()
+ if dim == 4 then
+ -- batch mode
+ self.ones:resizeAs(input[1]):fill(1)
+ local coef = self.meanestimator:updateOutput(self.ones)
+ self._coef = self._coef or input.new()
+ self._coef:resizeAs(coef):copy(coef) -- make contiguous for view
+ self.coef = self._coef:view(1,table.unpack(self._coef:size():totable())):expandAs(self.localstds)
+ else
+ self.ones:resizeAs(input):fill(1)
+ self.coef = self.meanestimator:updateOutput(self.ones)
+ end
+
+ end
+
+ -- normalize std dev
+ self.adjustedstds = self.divider:updateOutput{self.localstds, self.coef}
+ self.thresholdedstds = self.thresholder:updateOutput(self.adjustedstds)
+ self.output = self.normalizer:updateOutput{input, self.thresholdedstds}
+
+ -- done
+ return self.output
+end
+
+function SpatialDivisiveNormalization:updateGradInput(input, gradOutput)
+ -- resize grad
+ self.gradInput:resizeAs(input):zero()
+
+ -- backprop through all modules
+ local gradnorm = self.normalizer:updateGradInput({input, self.thresholdedstds}, gradOutput)
+ local gradadj = self.thresholder:updateGradInput(self.adjustedstds, gradnorm[2])
+ local graddiv = self.divider:updateGradInput({self.localstds, self.coef}, gradadj)
+ self.gradInput:add(self.stdestimator:updateGradInput(input, graddiv[1]))
+ self.gradInput:add(gradnorm[1])
+
+ -- done
+ return self.gradInput
+end
+
+function SpatialDivisiveNormalization:clearState()
+ if self.ones then self.ones:set() end
+ if self._coef then self._coef:set() end
+ self.meanestimator:clearState()
+ self.stdestimator:clearState()
+ return parent.clearState(self)
+end
diff --git a/contrib/lua-torch/nn/SpatialDropout.lua b/contrib/lua-torch/nn/SpatialDropout.lua
new file mode 100644
index 000000000..4320061b7
--- /dev/null
+++ b/contrib/lua-torch/nn/SpatialDropout.lua
@@ -0,0 +1,55 @@
+local SpatialDropout, Parent = torch.class('nn.SpatialDropout', 'nn.Module')
+
+function SpatialDropout:__init(p,stochasticInference)
+ Parent.__init(self)
+ self.p = p or 0.5
+ self.train = true
+ self.stochastic_inference = stochasticInference or false
+ self.noise = torch.Tensor()
+end
+
+function SpatialDropout:updateOutput(input)
+ self.output:resizeAs(input):copy(input)
+ if self.train or self.stochastic_inference then
+ if input:dim() == 4 then
+ self.noise:resize(input:size(1), input:size(2), 1, 1)
+ elseif input:dim() == 3 then
+ self.noise:resize(input:size(1), 1, 1)
+ else
+ error('Input must be 4D (nbatch, nfeat, h, w) or 3D (nfeat, h, w)')
+ end
+ self.noise:bernoulli(1-self.p)
+ -- We expand the random dropouts to the entire feature map because the
+ -- features are likely correlated across the map and so the dropout
+ -- should also be correlated.
+ self.output:cmul(torch.expandAs(self.noise, input))
+ else
+ self.output:mul(1-self.p)
+ end
+ return self.output
+end
+
+function SpatialDropout:updateGradInput(input, gradOutput)
+ if self.train then
+ self.gradInput:resizeAs(gradOutput):copy(gradOutput)
+ self.gradInput:cmul(torch.expandAs(self.noise, input)) -- simply mask the gradients with the noise vector
+ else
+ error('backprop only defined while training')
+ end
+ return self.gradInput
+end
+
+function SpatialDropout:setp(p)
+ self.p = p
+end
+
+function SpatialDropout:__tostring__()
+ return string.format('%s(%f)', torch.type(self), self.p)
+end
+
+function SpatialDropout:clearState()
+ if self.noise then
+ self.noise:set()
+ end
+ return Parent.clearState(self)
+end
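+
+-- Illustrative usage (sketch): whole feature maps are dropped during
+-- training; at evaluation the activations are scaled by (1-p) instead:
+--   local drop = nn.SpatialDropout(0.3)
+--   drop:training();  local y = drop:forward(torch.rand(2, 8, 5, 5))
+--   drop:evaluate();  y = drop:forward(torch.rand(2, 8, 5, 5))  -- scaled by 0.7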
diff --git a/contrib/lua-torch/nn/SpatialFractionalMaxPooling.lua b/contrib/lua-torch/nn/SpatialFractionalMaxPooling.lua
new file mode 100644
index 000000000..884751d41
--- /dev/null
+++ b/contrib/lua-torch/nn/SpatialFractionalMaxPooling.lua
@@ -0,0 +1,165 @@
+local SpatialFractionalMaxPooling, parent =
+ torch.class('nn.SpatialFractionalMaxPooling', 'nn.Module')
+
+-- Usage:
+-- nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, outW, outH)
+-- the output should be the exact size (outH x outW)
+-- nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, ratioW, ratioH)
+-- the output should be the size (floor(inH x ratioH) x floor(inW x ratioW))
+-- ratios are numbers between (0, 1) exclusive
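+-- e.g. nn.SpatialFractionalMaxPooling(2, 2, 0.5, 0.5) halves each spatial
+-- dimension, while nn.SpatialFractionalMaxPooling(2, 2, 7, 7) pools to 7x7.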
+function SpatialFractionalMaxPooling:__init(poolSizeW, poolSizeH, arg1, arg2)
+ parent.__init(self)
+ assert(poolSizeW >= 2)
+ assert(poolSizeH >= 2)
+
+ -- Pool size (how wide the pooling for each output unit is)
+ self.poolSizeW = poolSizeW
+ self.poolSizeH = poolSizeH
+
+ -- Random samples are drawn for all
+ -- batch * plane * (height, width; i.e., 2) points. This determines
+ -- the 2d "pseudorandom" overlapping pooling regions for each
+ -- (batch element x input plane). A new set of random samples is
+ -- drawn every updateOutput call, unless we disable it via
+ -- :fixPoolingRegions().
+ self.randomSamples = nil
+
+  -- Flag to disable re-generation of random samples on every
+  -- updateOutput call (useful for testing)
+ self.newRandomPool = false
+
+ if arg1 >= 1 and arg2 >= 1 then
+ -- Desired output size: the input tensor will determine the reduction
+ -- ratio
+ self.outW = arg1
+ self.outH = arg2
+ else
+    -- A reduction ratio was given instead; the actual output size
+    -- is derived from the size of each input
+ self.ratioW = arg1
+ self.ratioH = arg2
+
+ -- The reduction ratio must be between 0 and 1
+ assert(self.ratioW > 0 and self.ratioW < 1)
+ assert(self.ratioH > 0 and self.ratioH < 1)
+ end
+end
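+
+-- Construction sketches (values are illustrative):
+--   nn.SpatialFractionalMaxPooling(2, 2, 16, 16)   -- exact 16x16 output
+--   nn.SpatialFractionalMaxPooling(2, 2, 0.5, 0.5) -- halve each spatial dim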
+
+function SpatialFractionalMaxPooling:getBufferSize_(input)
+ local batchSize = 0
+ local planeSize = 0
+
+ if input:nDimension() == 3 then
+ batchSize = 1
+ planeSize = input:size(1)
+ elseif input:nDimension() == 4 then
+ batchSize = input:size(1)
+ planeSize = input:size(2)
+ else
+ error('input must be dim 3 or 4')
+ end
+
+ return torch.LongStorage({batchSize, planeSize, 2})
+end
+
+function SpatialFractionalMaxPooling:initSampleBuffer_(input)
+ local sampleBufferSize = self:getBufferSize_(input)
+
+ if self.randomSamples == nil then
+ self.randomSamples = input.new():resize(sampleBufferSize):uniform()
+ elseif (self.randomSamples:size(1) ~= sampleBufferSize[1] or
+ self.randomSamples:size(2) ~= sampleBufferSize[2]) then
+ self.randomSamples:resize(sampleBufferSize):uniform()
+ else
+ if not self.newRandomPool then
+ -- Create new pooling windows, since this is a subsequent call
+ self.randomSamples:uniform()
+ end
+ end
+end
+
+function SpatialFractionalMaxPooling:getOutputSizes_(input)
+ local outW = self.outW
+ local outH = self.outH
+ if self.ratioW ~= nil and self.ratioH ~= nil then
+ if input:nDimension() == 4 then
+ outW = math.floor(input:size(4) * self.ratioW)
+ outH = math.floor(input:size(3) * self.ratioH)
+ elseif input:nDimension() == 3 then
+ outW = math.floor(input:size(3) * self.ratioW)
+ outH = math.floor(input:size(2) * self.ratioH)
+ else
+ error('input must be dim 3 or 4')
+ end
+
+ -- Neither can be smaller than 1
+ assert(outW > 0, 'reduction ratio or input width too small')
+ assert(outH > 0, 'reduction ratio or input height too small')
+ else
+ assert(outW ~= nil and outH ~= nil)
+ end
+
+ return outW, outH
+end
+
+-- Call this to turn off regeneration of random pooling regions each
+-- updateOutput call.
+function SpatialFractionalMaxPooling:fixPoolingRegions(val)
+ if val == nil then
+ val = true
+ end
+
+ self.newRandomPool = val
+ return self
+end
+
+function SpatialFractionalMaxPooling:updateOutput(input)
+ self.indices = self.indices or torch.LongTensor()
+ if torch.typename(input):find('torch%.Cuda.*Tensor') then
+ self.indices = torch.CudaLongTensor and self.indices:cudaLong() or self.indices
+ else
+ self.indices = self.indices:long()
+ end
+ self:initSampleBuffer_(input)
+ local outW, outH = self:getOutputSizes_(input)
+
+ input.THNN.SpatialFractionalMaxPooling_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ outW, outH, self.poolSizeW, self.poolSizeH,
+ self.indices:cdata(), self.randomSamples:cdata())
+ return self.output
+end
+
+function SpatialFractionalMaxPooling:updateGradInput(input, gradOutput)
+ assert(self.randomSamples ~= nil,
+ 'must call updateOutput/forward first')
+
+ local outW, outH = self:getOutputSizes_(input)
+
+ input.THNN.SpatialFractionalMaxPooling_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ outW, outH, self.poolSizeW, self.poolSizeH,
+ self.indices:cdata())
+ return self.gradInput
+end
+
+-- backward compat
+function SpatialFractionalMaxPooling:empty()
+ self:clearState()
+end
+
+function SpatialFractionalMaxPooling:clearState()
+ self.indices = nil
+ self.randomSamples = nil
+ return parent.clearState(self)
+end
+
+function SpatialFractionalMaxPooling:__tostring__()
+ return string.format('%s(%dx%d, %d,%d)', torch.type(self),
+ self.outW and self.outW or self.ratioW,
+ self.outH and self.outH or self.ratioH,
+ self.poolSizeW, self.poolSizeH)
+end
diff --git a/contrib/lua-torch/nn/SpatialFullConvolution.lua b/contrib/lua-torch/nn/SpatialFullConvolution.lua
new file mode 100644
index 000000000..e6019bc18
--- /dev/null
+++ b/contrib/lua-torch/nn/SpatialFullConvolution.lua
@@ -0,0 +1,219 @@
+local THNN = require 'nn.THNN'
+local SpatialFullConvolution, parent = torch.class('nn.SpatialFullConvolution','nn.Module')
+
+function SpatialFullConvolution:__init(nInputPlane, nOutputPlane,
+ kW, kH, dW, dH, padW, padH, adjW, adjH)
+ parent.__init(self)
+
+ dW = dW or 1
+ dH = dH or 1
+
+ self.nInputPlane = nInputPlane
+ self.nOutputPlane = nOutputPlane
+ self.kW = kW
+ self.kH = kH
+ self.dW = dW
+ self.dH = dH
+ self.padW = padW or 0
+ self.padH = padH or 0
+ self.adjW = adjW or 0
+ self.adjH = adjH or 0
+
+ if self.adjW > self.dW - 1 or self.adjH > self.dH - 1 then
+      error('adjW and adjH must not be greater than self.dW - 1' ..
+            ' and self.dH - 1 respectively')
+ end
+
+ self.weight = torch.Tensor(nInputPlane, nOutputPlane, kH, kW)
+ self.gradWeight = torch.Tensor(nInputPlane, nOutputPlane, kH, kW)
+ self.bias = torch.Tensor(self.nOutputPlane)
+ self.gradBias = torch.Tensor(self.nOutputPlane)
+
+ self.ones = torch.Tensor()
+
+ self:reset()
+end
+
+function SpatialFullConvolution:noBias()
+ self.bias = nil
+ self.gradBias = nil
+ return self
+end
+
+function SpatialFullConvolution:reset(stdv)
+ if stdv then
+ stdv = stdv * math.sqrt(3)
+ else
+ local nInputPlane = self.nInputPlane
+ local kH = self.kH
+ local kW = self.kW
+ stdv = 1/math.sqrt(kW*kH*nInputPlane)
+ end
+ self.weight:uniform(-stdv, stdv)
+ if self.bias then
+ self.bias:uniform(-stdv, stdv)
+ end
+end
+
+local function calculateAdj(targetSize, ker, pad, stride)
+ return (targetSize + 2 * pad - ker) % stride
+end
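+
+-- For reference, the produced output size is
+--   owidth  = (width  - 1) * dW - 2*padW + kW + adjW
+--   oheight = (height - 1) * dH - 2*padH + kH + adjH
+-- so calculateAdj picks the adj that makes a desired target size reachable.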
+
+function SpatialFullConvolution:backCompatibility()
+ self.adjW = self.adjW or 0
+ self.adjH = self.adjH or 0
+end
+
+function SpatialFullConvolution:updateOutput(input)
+ self:backCompatibility()
+
+ local inputTensor = input
+ local adjW, adjH = self.adjW, self.adjH
+
+ -- The input can be a table where the second element indicates the target
+ -- output size, in which case the adj factors are computed automatically
+ if type(inputTensor) == 'table' then
+ inputTensor = input[1]
+ local targetTensor = input[2]
+ local tDims = targetTensor:dim()
+ local tH = targetTensor:size(tDims-1)
+ local tW = targetTensor:size(tDims)
+ adjW = calculateAdj(tW, self.kW, self.padW, self.dW)
+ adjH = calculateAdj(tH, self.kH, self.padH, self.dH)
+ self.finput = self.finput or input[1].new()
+ self.fgradInput = self.fgradInput or input[1].new()
+ else
+ self.finput = self.finput or input.new()
+ self.fgradInput = self.fgradInput or input.new()
+ end
+
+ inputTensor.THNN.SpatialFullConvolution_updateOutput(
+ inputTensor:cdata(),
+ self.output:cdata(),
+ self.weight:cdata(),
+ THNN.optionalTensor(self.bias),
+ self.finput:cdata(),
+ self.fgradInput:cdata(),
+ self.kW, self.kH,
+ self.dW, self.dH,
+ self.padW, self.padH,
+ adjW, adjH
+ )
+
+ return self.output
+end
+
+function SpatialFullConvolution:updateGradInput(input, gradOutput)
+ self:backCompatibility()
+
+ if self.gradInput then
+
+ local inputTensor = input
+ local adjW, adjH = self.adjW, self.adjH
+
+ -- The input can be a table where the second element indicates the target
+ -- output size, in which case the adj factors are computed automatically
+ if type(inputTensor) == 'table' then
+ inputTensor = input[1]
+ local targetTensor = input[2]
+ local tDims = targetTensor:dim()
+ local tH = targetTensor:size(tDims-1)
+ local tW = targetTensor:size(tDims)
+ adjW = calculateAdj(tW, self.kW, self.padW, self.dW)
+ adjH = calculateAdj(tH, self.kH, self.padH, self.dH)
+ -- Momentarily extract the gradInput tensor
+ if type(self.gradInput) == 'table' then
+ self.gradInput = self.gradInput[1] or inputTensor.new()
+ end
+ end
+
+ inputTensor.THNN.SpatialFullConvolution_updateGradInput(
+ inputTensor:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.weight:cdata(),
+ self.finput:cdata(),
+ self.kW, self.kH,
+ self.dW, self.dH,
+ self.padW, self.padH,
+ adjW, adjH
+ )
+
+ if type(input) == 'table' then
+ -- Create a zero tensor to be expanded and used as gradInput[2].
+ self.zeroScalar = self.zeroScalar or input[2].new(1):zero()
+ self.ones:resize(input[2]:dim()):fill(1)
+ local zeroTensor = self.zeroScalar
+ :view(table.unpack(self.ones:totable()))
+ :expandAs(input[2])
+ self.gradInput = {self.gradInput, zeroTensor}
+ end
+
+ return self.gradInput
+ end
+end
+
+function SpatialFullConvolution:accGradParameters(input, gradOutput, scale)
+ scale = scale or 1
+ self:backCompatibility()
+
+ local inputTensor = input
+ local adjW, adjH = self.adjW, self.adjH
+
+ -- The input can be a table where the second element indicates the target
+ -- output size, in which case the adj factors are computed automatically
+ if type(inputTensor) == 'table' then
+ inputTensor = input[1]
+ local targetTensor = input[2]
+ local tDims = targetTensor:dim()
+ local tH = targetTensor:size(tDims-1)
+ local tW = targetTensor:size(tDims)
+ adjW = calculateAdj(tW, self.kW, self.padW, self.dW)
+ adjH = calculateAdj(tH, self.kH, self.padH, self.dH)
+ end
+
+ inputTensor.THNN.SpatialFullConvolution_accGradParameters(
+ inputTensor:cdata(),
+ gradOutput:cdata(),
+ self.gradWeight:cdata(),
+ THNN.optionalTensor(self.gradBias),
+ self.finput:cdata(),
+ self.fgradInput:cdata(),
+ self.kW, self.kH,
+ self.dW, self.dH,
+ self.padW, self.padH,
+ adjW, adjH,
+ scale
+ )
+end
+
+function SpatialFullConvolution:type(type, tensorCache)
+ self.finput = self.finput and torch.Tensor()
+ self.fgradInput = self.fgradInput and torch.Tensor()
+ return parent.type(self, type, tensorCache)
+end
+
+function SpatialFullConvolution:__tostring__()
+ local s = string.format('%s(%d -> %d, %dx%d', torch.type(self),
+ self.nInputPlane, self.nOutputPlane, self.kW, self.kH)
+ if self.dW ~= 1 or self.dH ~= 1 or self.padW ~= 0 or self.padH ~= 0 then
+ s = s .. string.format(', %d,%d', self.dW, self.dH)
+ end
+ if (self.padW or self.padH) and (self.padW ~= 0 or self.padH ~= 0) then
+ s = s .. ', ' .. self.padW .. ',' .. self.padH
+ end
+ if (self.adjW or self.adjH) and (self.adjW ~= 0 or self.adjH ~= 0) then
+ s = s .. ', ' .. self.adjW .. ',' .. self.adjH
+ end
+ if self.bias then
+ return s .. ')'
+ else
+ return s .. ') without bias'
+ end
+end
+
+function SpatialFullConvolution:clearState()
+ nn.utils.clear(self, 'finput', 'fgradInput', '_input', '_gradOutput')
+ return parent.clearState(self)
+end
+
diff --git a/contrib/lua-torch/nn/SpatialFullConvolutionMap.lua b/contrib/lua-torch/nn/SpatialFullConvolutionMap.lua
new file mode 100644
index 000000000..008f5e7cf
--- /dev/null
+++ b/contrib/lua-torch/nn/SpatialFullConvolutionMap.lua
@@ -0,0 +1,91 @@
+local SpatialFullConvolutionMap, parent = torch.class('nn.SpatialFullConvolutionMap', 'nn.Module')
+
+function SpatialFullConvolutionMap:__init(conMatrix, kW, kH, dW, dH)
+ parent.__init(self)
+
+ dW = dW or 1
+ dH = dH or 1
+
+ self.kW = kW
+ self.kH = kH
+ self.dW = dW
+ self.dH = dH
+ self.connTable = conMatrix
+ self.nInputPlane = self.connTable:select(2,1):max()
+ self.nOutputPlane = self.connTable:select(2,2):max()
+
+ self.weight = torch.Tensor(self.connTable:size(1), kH, kW)
+ self.gradWeight = torch.Tensor(self.connTable:size(1), kH, kW)
+
+ self.bias = torch.Tensor(self.nOutputPlane)
+ self.gradBias = torch.Tensor(self.nOutputPlane)
+
+ self:reset()
+end
+
+function SpatialFullConvolutionMap:reset(stdv)
+ if stdv then
+ stdv = stdv * math.sqrt(3)
+ self.weight:apply(function()
+ return torch.uniform(-stdv, stdv)
+ end)
+ self.bias:apply(function()
+ return torch.uniform(-stdv, stdv)
+ end)
+ else
+ local ninp = torch.Tensor(self.nOutputPlane):zero()
+ for i=1,self.connTable:size(1) do ninp[self.connTable[i][2]] = ninp[self.connTable[i][2]]+1 end
+ for k=1,self.connTable:size(1) do
+ stdv = 1/math.sqrt(self.kW*self.kH*ninp[self.connTable[k][2]])
+ self.weight:select(1,k):apply(function() return torch.uniform(-stdv,stdv) end)
+ end
+ for k=1,self.bias:size(1) do
+ stdv = 1/math.sqrt(self.kW*self.kH*ninp[k])
+ self.bias[k] = torch.uniform(-stdv,stdv)
+ end
+
+ end
+end
+
+function SpatialFullConvolutionMap:updateOutput(input)
+ input.THNN.SpatialFullConvolutionMap_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ self.weight:cdata(),
+ self.bias:cdata(),
+ self.connTable:cdata(),
+ self.nInputPlane,
+ self.nOutputPlane,
+ self.dW, self.dH
+ )
+ return self.output
+end
+
+function SpatialFullConvolutionMap:updateGradInput(input, gradOutput)
+ input.THNN.SpatialFullConvolutionMap_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.weight:cdata(),
+ self.bias:cdata(),
+ self.connTable:cdata(),
+ self.nInputPlane,
+ self.nOutputPlane,
+ self.dW, self.dH
+ )
+ return self.gradInput
+end
+
+function SpatialFullConvolutionMap:accGradParameters(input, gradOutput, scale)
+ input.THNN.SpatialFullConvolutionMap_accGradParameters(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradWeight:cdata(),
+ self.gradBias:cdata(),
+ self.connTable:cdata(),
+ self.nInputPlane,
+ self.nOutputPlane,
+ self.dW, self.dH,
+ scale or 1
+ )
+end
diff --git a/contrib/lua-torch/nn/SpatialLPPooling.lua b/contrib/lua-torch/nn/SpatialLPPooling.lua
new file mode 100644
index 000000000..49a8493cf
--- /dev/null
+++ b/contrib/lua-torch/nn/SpatialLPPooling.lua
@@ -0,0 +1,43 @@
+local SpatialLPPooling, parent = torch.class('nn.SpatialLPPooling', 'nn.Sequential')
+
+function SpatialLPPooling:__init(nInputPlane, pnorm, kW, kH, dW, dH)
+ parent.__init(self)
+
+ dW = dW or kW
+ dH = dH or kH
+
+ self.kW = kW
+ self.kH = kH
+ self.dW = dW
+ self.dH = dH
+
+ if pnorm == 2 then
+ self:add(nn.Square())
+ else
+ self:add(nn.Power(pnorm))
+ end
+ self:add(nn.SpatialAveragePooling(kW, kH, dW, dH))
+ self:add(nn.MulConstant(kW*kH))
+ if pnorm == 2 then
+ self:add(nn.Sqrt())
+ else
+ self:add(nn.Power(1/pnorm))
+ end
+end
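+
+-- Per pooling window W the pipeline above computes (the MulConstant
+-- rescales the average back into a plain sum):
+--   output = (sum over x in W of x^pnorm)^(1/pnorm)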
+
+-- The module is a Sequential; by default it would try to learn the
+-- parameters of its sub-modules, so we disable that by overriding the
+-- parameter-update methods below.
+function SpatialLPPooling:reset()
+end
+
+function SpatialLPPooling:accGradParameters()
+end
+
+function SpatialLPPooling:accUpdateGradParameters()
+end
+
+function SpatialLPPooling:zeroGradParameters()
+end
+
+function SpatialLPPooling:updateParameters()
+end
diff --git a/contrib/lua-torch/nn/SpatialLogSoftMax.lua b/contrib/lua-torch/nn/SpatialLogSoftMax.lua
new file mode 100644
index 000000000..9c81d49e1
--- /dev/null
+++ b/contrib/lua-torch/nn/SpatialLogSoftMax.lua
@@ -0,0 +1,19 @@
+local SpatialLogSoftMax = torch.class('nn.SpatialLogSoftMax', 'nn.Module')
+
+function SpatialLogSoftMax:updateOutput(input)
+ input.THNN.LogSoftMax_updateOutput(
+ input:cdata(),
+ self.output:cdata()
+ )
+ return self.output
+end
+
+function SpatialLogSoftMax:updateGradInput(input, gradOutput)
+ input.THNN.LogSoftMax_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.output:cdata()
+ )
+ return self.gradInput
+end
diff --git a/contrib/lua-torch/nn/SpatialMaxPooling.lua b/contrib/lua-torch/nn/SpatialMaxPooling.lua
new file mode 100644
index 000000000..5c865c631
--- /dev/null
+++ b/contrib/lua-torch/nn/SpatialMaxPooling.lua
@@ -0,0 +1,94 @@
+local SpatialMaxPooling, parent = torch.class('nn.SpatialMaxPooling', 'nn.Module')
+
+function SpatialMaxPooling:__init(kW, kH, dW, dH, padW, padH)
+ parent.__init(self)
+
+ dW = dW or kW
+ dH = dH or kH
+
+ self.kW = kW
+ self.kH = kH
+ self.dW = dW
+ self.dH = dH
+
+ self.padW = padW or 0
+ self.padH = padH or 0
+
+ self.ceil_mode = false
+ self.indices = torch.LongTensor()
+end
+
+function SpatialMaxPooling:ceil()
+ self.ceil_mode = true
+ return self
+end
+
+function SpatialMaxPooling:floor()
+ self.ceil_mode = false
+ return self
+end
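+
+-- Output size (op is floor by default, ceil after calling :ceil()):
+--   owidth  = op((width  + 2*padW - kW) / dW + 1)
+--   oheight = op((height + 2*padH - kH) / dH + 1)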
+
+function SpatialMaxPooling:updateOutput(input)
+ self.indices = self.indices or torch.LongTensor()
+ if torch.typename(input):find('torch%.Cuda.*Tensor') then
+ self.indices = torch.CudaLongTensor and self.indices:cudaLong() or self.indices
+ else
+ self.indices = self.indices:long()
+ end
+
+ local dims = input:dim()
+ self.iheight = input:size(dims-1)
+ self.iwidth = input:size(dims)
+
+ -- backward compatibility
+ self.ceil_mode = self.ceil_mode or false
+ self.padW = self.padW or 0
+ self.padH = self.padH or 0
+ input.THNN.SpatialMaxPooling_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ self.indices:cdata(),
+ self.kW, self.kH,
+ self.dW, self.dH,
+ self.padW, self.padH,
+ self.ceil_mode
+ )
+ return self.output
+end
+
+function SpatialMaxPooling:updateGradInput(input, gradOutput)
+ input.THNN.SpatialMaxPooling_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.indices:cdata(),
+ self.kW, self.kH,
+ self.dW, self.dH,
+ self.padW, self.padH,
+ self.ceil_mode
+ )
+ return self.gradInput
+end
+
+-- for backward compat
+function SpatialMaxPooling:empty()
+ self:clearState()
+end
+
+function SpatialMaxPooling:__tostring__()
+ local s = string.format('%s(%dx%d, %d,%d', torch.type(self),
+ self.kW, self.kH, self.dW, self.dH)
+ if (self.padW or self.padH) and (self.padW ~= 0 or self.padH ~= 0) then
+ s = s .. ', ' .. self.padW .. ','.. self.padH
+ end
+ s = s .. ')'
+
+ return s
+end
+
+function SpatialMaxPooling:clearState()
+ if self.indices then
+ self.indices:set()
+ end
+ return parent.clearState(self)
+end
diff --git a/contrib/lua-torch/nn/SpatialMaxUnpooling.lua b/contrib/lua-torch/nn/SpatialMaxUnpooling.lua
new file mode 100644
index 000000000..408bcc052
--- /dev/null
+++ b/contrib/lua-torch/nn/SpatialMaxUnpooling.lua
@@ -0,0 +1,45 @@
+local SpatialMaxUnpooling, parent = torch.class('nn.SpatialMaxUnpooling', 'nn.Module')
+
+function SpatialMaxUnpooling:__init(poolingModule)
+ parent.__init(self)
+ assert(torch.type(poolingModule)=='nn.SpatialMaxPooling', 'Argument must be a nn.SpatialMaxPooling module')
+ assert(poolingModule.kH==poolingModule.dH and poolingModule.kW==poolingModule.dW, "The size of pooling module's kernel must be equal to its stride")
+ self.pooling = poolingModule
+end
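+
+-- A minimal usage sketch (sizes are illustrative); the pooling module must
+-- run forward first so that its indices are populated:
+-- local pool   = nn.SpatialMaxPooling(2, 2, 2, 2)
+-- local unpool = nn.SpatialMaxUnpooling(pool)
+-- local z = unpool:forward(pool:forward(x)) -- z has the spatial size of x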
+
+function SpatialMaxUnpooling:setParams()
+ self.indices = self.pooling.indices
+ self.oheight = self.pooling.iheight
+ self.owidth = self.pooling.iwidth
+end
+
+function SpatialMaxUnpooling:updateOutput(input)
+ self:setParams()
+ input.THNN.SpatialMaxUnpooling_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ self.indices:cdata(),
+ self.owidth, self.oheight
+ )
+ return self.output
+end
+
+function SpatialMaxUnpooling:updateGradInput(input, gradOutput)
+ self:setParams()
+ input.THNN.SpatialMaxUnpooling_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.indices:cdata(),
+ self.owidth, self.oheight
+ )
+ return self.gradInput
+end
+
+function SpatialMaxUnpooling:empty()
+ self:clearState()
+end
+
+function SpatialMaxUnpooling:__tostring__()
+   return 'nn.SpatialMaxUnpooling associated with '..tostring(self.pooling)
+end
diff --git a/contrib/lua-torch/nn/SpatialReflectionPadding.lua b/contrib/lua-torch/nn/SpatialReflectionPadding.lua
new file mode 100644
index 000000000..9ce4612ad
--- /dev/null
+++ b/contrib/lua-torch/nn/SpatialReflectionPadding.lua
@@ -0,0 +1,51 @@
+local SpatialReflectionPadding, parent =
+ torch.class('nn.SpatialReflectionPadding', 'nn.Module')
+
+function SpatialReflectionPadding:__init(pad_l, pad_r, pad_t, pad_b)
+ parent.__init(self)
+ self.pad_l = pad_l
+ self.pad_r = pad_r or self.pad_l
+ self.pad_t = pad_t or self.pad_l
+ self.pad_b = pad_b or self.pad_l
+end
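+
+-- The output is (height + pad_t + pad_b) x (width + pad_l + pad_r), with the
+-- borders filled by reflecting the input.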
+
+function SpatialReflectionPadding:updateOutput(input)
+ if input:dim() == 3 or input:dim() == 4 then
+ input.THNN.SpatialReflectionPadding_updateOutput(
+ input:cdata(), self.output:cdata(),
+ self.pad_l, self.pad_r, self.pad_t, self.pad_b)
+ else
+ error('input must be 3 or 4-dimensional')
+ end
+ return self.output
+end
+
+function SpatialReflectionPadding:updateGradInput(input, gradOutput)
+ if input:dim() == 3 and gradOutput:dim() == 3 then
+ assert(input:size(1) == gradOutput:size(1)
+ and input:size(2) + self.pad_t + self.pad_b == gradOutput:size(2)
+ and input:size(3) + self.pad_l + self.pad_r == gradOutput:size(3),
+ 'input and gradOutput must be compatible in size')
+ elseif input:dim() == 4 and gradOutput:dim() == 4 then
+ assert(input:size(1) == gradOutput:size(1)
+ and input:size(2) == gradOutput:size(2)
+ and input:size(3) + self.pad_t + self.pad_b == gradOutput:size(3)
+ and input:size(4) + self.pad_l + self.pad_r == gradOutput:size(4),
+ 'input and gradOutput must be compatible in size')
+ else
+ error(
+ [[input and gradOutput must be 3 or 4-dimensional
+ and have equal number of dimensions]]
+ )
+ end
+ input.THNN.SpatialReflectionPadding_updateGradInput(
+ input:cdata(), gradOutput:cdata(), self.gradInput:cdata(),
+ self.pad_l, self.pad_r, self.pad_t, self.pad_b)
+ return self.gradInput
+end
+
+function SpatialReflectionPadding:__tostring__()
+ return torch.type(self) ..
+ string.format('(l=%d, r=%d, t=%d, b=%d)', self.pad_l, self.pad_r,
+ self.pad_t, self.pad_b)
+end
diff --git a/contrib/lua-torch/nn/SpatialReplicationPadding.lua b/contrib/lua-torch/nn/SpatialReplicationPadding.lua
new file mode 100644
index 000000000..429763f9b
--- /dev/null
+++ b/contrib/lua-torch/nn/SpatialReplicationPadding.lua
@@ -0,0 +1,51 @@
+local SpatialReplicationPadding, parent =
+ torch.class('nn.SpatialReplicationPadding', 'nn.Module')
+
+function SpatialReplicationPadding:__init(pad_l, pad_r, pad_t, pad_b)
+ parent.__init(self)
+ self.pad_l = pad_l
+ self.pad_r = pad_r or self.pad_l
+ self.pad_t = pad_t or self.pad_l
+ self.pad_b = pad_b or self.pad_l
+end
+
+function SpatialReplicationPadding:updateOutput(input)
+ if input:dim() == 3 or input:dim() == 4 then
+ input.THNN.SpatialReplicationPadding_updateOutput(
+ input:cdata(), self.output:cdata(),
+ self.pad_l, self.pad_r, self.pad_t, self.pad_b)
+ else
+ error('input must be 3 or 4-dimensional')
+ end
+ return self.output
+end
+
+function SpatialReplicationPadding:updateGradInput(input, gradOutput)
+ if input:dim() == 3 and gradOutput:dim() == 3 then
+ assert(input:size(1) == gradOutput:size(1)
+ and input:size(2) + self.pad_t + self.pad_b == gradOutput:size(2)
+ and input:size(3) + self.pad_l + self.pad_r == gradOutput:size(3),
+ 'input and gradOutput must be compatible in size')
+ elseif input:dim() == 4 and gradOutput:dim() == 4 then
+ assert(input:size(1) == gradOutput:size(1)
+ and input:size(2) == gradOutput:size(2)
+ and input:size(3) + self.pad_t + self.pad_b == gradOutput:size(3)
+ and input:size(4) + self.pad_l + self.pad_r == gradOutput:size(4),
+ 'input and gradOutput must be compatible in size')
+ else
+ error(
+ [[input and gradOutput must be 3 or 4-dimensional
+ and have equal number of dimensions]]
+ )
+ end
+ input.THNN.SpatialReplicationPadding_updateGradInput(
+ input:cdata(), gradOutput:cdata(), self.gradInput:cdata(),
+ self.pad_l, self.pad_r, self.pad_t, self.pad_b)
+ return self.gradInput
+end
+
+function SpatialReplicationPadding:__tostring__()
+ return torch.type(self) ..
+ string.format('(l=%d, r=%d, t=%d, b=%d)', self.pad_l, self.pad_r,
+ self.pad_t, self.pad_b)
+end
diff --git a/contrib/lua-torch/nn/SpatialSoftMax.lua b/contrib/lua-torch/nn/SpatialSoftMax.lua
new file mode 100644
index 000000000..56f0b40e2
--- /dev/null
+++ b/contrib/lua-torch/nn/SpatialSoftMax.lua
@@ -0,0 +1,19 @@
+local SpatialSoftMax, _ = torch.class('nn.SpatialSoftMax', 'nn.Module')
+
+function SpatialSoftMax:updateOutput(input)
+ input.THNN.SoftMax_updateOutput(
+ input:cdata(),
+ self.output:cdata()
+ )
+ return self.output
+end
+
+function SpatialSoftMax:updateGradInput(input, gradOutput)
+ input.THNN.SoftMax_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.output:cdata()
+ )
+ return self.gradInput
+end
diff --git a/contrib/lua-torch/nn/SpatialSubSampling.lua b/contrib/lua-torch/nn/SpatialSubSampling.lua
new file mode 100644
index 000000000..4e3fb8881
--- /dev/null
+++ b/contrib/lua-torch/nn/SpatialSubSampling.lua
@@ -0,0 +1,79 @@
+local SpatialSubSampling, parent = torch.class('nn.SpatialSubSampling', 'nn.Module')
+
+function SpatialSubSampling:__init(nInputPlane, kW, kH, dW, dH)
+ parent.__init(self)
+
+ dW = dW or 1
+ dH = dH or 1
+
+ self.nInputPlane = nInputPlane
+ self.kW = kW
+ self.kH = kH
+ self.dW = dW
+ self.dH = dH
+
+ self.weight = torch.Tensor(nInputPlane)
+ self.bias = torch.Tensor(nInputPlane)
+ self.gradWeight = torch.Tensor(nInputPlane)
+ self.gradBias = torch.Tensor(nInputPlane)
+
+ self:reset()
+end
+
+function SpatialSubSampling:reset(stdv)
+ if stdv then
+ stdv = stdv * math.sqrt(3)
+ else
+ stdv = 1/math.sqrt(self.kW*self.kH)
+ end
+ if nn.oldSeed then
+ self.weight:apply(function()
+ return torch.uniform(-stdv, stdv)
+ end)
+ self.bias:apply(function()
+ return torch.uniform(-stdv, stdv)
+ end)
+ else
+ self.weight:uniform(-stdv, stdv)
+ self.bias:uniform(-stdv, stdv)
+ end
+end
+
+function SpatialSubSampling:updateOutput(input)
+ input.THNN.SpatialSubSampling_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ self.weight:cdata(),
+ self.bias:cdata(),
+ self.kW, self.kH,
+ self.dW, self.dH
+ )
+ return self.output
+end
+
+function SpatialSubSampling:updateGradInput(input, gradOutput)
+ if self.gradInput then
+ input.THNN.SpatialSubSampling_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.weight:cdata(),
+ self.kW, self.kH,
+ self.dW, self.dH
+ )
+ return self.gradInput
+ end
+end
+
+function SpatialSubSampling:accGradParameters(input, gradOutput, scale)
+ scale = scale or 1
+ input.THNN.SpatialSubSampling_accGradParameters(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradWeight:cdata(),
+ self.gradBias:cdata(),
+ self.kW, self.kH,
+ self.dW, self.dH,
+ scale
+ )
+end
diff --git a/contrib/lua-torch/nn/SpatialSubtractiveNormalization.lua b/contrib/lua-torch/nn/SpatialSubtractiveNormalization.lua
new file mode 100644
index 000000000..d430083e9
--- /dev/null
+++ b/contrib/lua-torch/nn/SpatialSubtractiveNormalization.lua
@@ -0,0 +1,115 @@
+local SpatialSubtractiveNormalization, parent = torch.class('nn.SpatialSubtractiveNormalization','nn.Module')
+
+function SpatialSubtractiveNormalization:__init(nInputPlane, kernel)
+ parent.__init(self)
+
+ -- get args
+ self.nInputPlane = nInputPlane or 1
+ self.kernel = kernel or torch.Tensor(9,9):fill(1)
+ local kdim = self.kernel:nDimension()
+
+ -- check args
+ if kdim ~= 2 and kdim ~= 1 then
+ error('<SpatialSubtractiveNormalization> averaging kernel must be 2D or 1D')
+ end
+ if (self.kernel:size(1) % 2) == 0 or (kdim == 2 and (self.kernel:size(2) % 2) == 0) then
+ error('<SpatialSubtractiveNormalization> averaging kernel must have ODD dimensions')
+ end
+
+ -- normalize kernel
+ self.kernel:div(self.kernel:sum() * self.nInputPlane)
+
+ -- padding values
+ local padH = math.floor(self.kernel:size(1)/2)
+ local padW = padH
+ if kdim == 2 then
+ padW = math.floor(self.kernel:size(2)/2)
+ end
+
+ -- create convolutional mean extractor
+ self.meanestimator = nn.Sequential()
+ self.meanestimator:add(nn.SpatialZeroPadding(padW, padW, padH, padH))
+ if kdim == 2 then
+ self.meanestimator:add(nn.SpatialConvolution(self.nInputPlane, 1, self.kernel:size(2), self.kernel:size(1)))
+ else
+ self.meanestimator:add(nn.SpatialConvolutionMap(nn.tables.oneToOne(self.nInputPlane), self.kernel:size(1), 1))
+ self.meanestimator:add(nn.SpatialConvolution(self.nInputPlane, 1, 1, self.kernel:size(1)))
+ end
+ self.meanestimator:add(nn.Replicate(self.nInputPlane,1,3))
+
+ -- set kernel and bias
+ if kdim == 2 then
+ for i = 1,self.nInputPlane do
+ self.meanestimator.modules[2].weight[1][i] = self.kernel
+ end
+ self.meanestimator.modules[2].bias:zero()
+ else
+ for i = 1,self.nInputPlane do
+ self.meanestimator.modules[2].weight[i]:copy(self.kernel)
+ self.meanestimator.modules[3].weight[1][i]:copy(self.kernel)
+ end
+ self.meanestimator.modules[2].bias:zero()
+ self.meanestimator.modules[3].bias:zero()
+ end
+
+ -- other operation
+ self.subtractor = nn.CSubTable()
+ self.divider = nn.CDivTable()
+
+ -- coefficient array, to adjust side effects
+ self.coef = torch.Tensor(1,1,1)
+end
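+
+-- In short, updateOutput computes output = input - (kernel * input) / coef,
+-- where * is a same-size convolution and coef compensates for the smaller
+-- effective kernel support near the borders.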
+
+function SpatialSubtractiveNormalization:updateOutput(input)
+ -- compute side coefficients
+ local dim = input:dim()
+ if input:dim()+1 ~= self.coef:dim() or (input:size(dim) ~= self.coef:size(dim)) or (input:size(dim-1) ~= self.coef:size(dim-1)) then
+ self.ones = self.ones or input.new()
+ self._coef = self._coef or self.coef.new()
+ if dim == 4 then
+ -- batch mode
+ self.ones:resizeAs(input[1]):fill(1)
+ local coef = self.meanestimator:updateOutput(self.ones)
+ self._coef:resizeAs(coef):copy(coef) -- make contiguous for view
+ local size = coef:size():totable()
+ table.insert(size,1,input:size(1))
+ self.coef = self._coef:view(1,table.unpack(self._coef:size():totable())):expand(table.unpack(size))
+ else
+ self.ones:resizeAs(input):fill(1)
+ local coef = self.meanestimator:updateOutput(self.ones)
+ self._coef:resizeAs(coef):copy(coef) -- copy meanestimator.output as it will be used below
+ self.coef = self._coef
+ end
+
+ end
+
+ -- compute mean
+ self.localsums = self.meanestimator:updateOutput(input)
+ self.adjustedsums = self.divider:updateOutput{self.localsums, self.coef}
+ self.output = self.subtractor:updateOutput{input, self.adjustedsums}
+
+ -- done
+ return self.output
+end
+
+function SpatialSubtractiveNormalization:updateGradInput(input, gradOutput)
+ -- resize grad
+ self.gradInput:resizeAs(input):zero()
+
+ -- backprop through all modules
+ local gradsub = self.subtractor:updateGradInput({input, self.adjustedsums}, gradOutput)
+ local graddiv = self.divider:updateGradInput({self.localsums, self.coef}, gradsub[2])
+   local gradmean = self.meanestimator:updateGradInput(input, graddiv[1])
+   self.gradInput:add(gradmean)
+ self.gradInput:add(gradsub[1])
+
+ -- done
+ return self.gradInput
+end
+
+function SpatialSubtractiveNormalization:clearState()
+ if self.ones then self.ones:set() end
+ if self._coef then self._coef:set() end
+ self.meanestimator:clearState()
+ return parent.clearState(self)
+end
diff --git a/contrib/lua-torch/nn/SpatialUpSamplingBilinear.lua b/contrib/lua-torch/nn/SpatialUpSamplingBilinear.lua
new file mode 100644
index 000000000..12e1ce8f2
--- /dev/null
+++ b/contrib/lua-torch/nn/SpatialUpSamplingBilinear.lua
@@ -0,0 +1,139 @@
+require 'nn.THNN'
+local SpatialUpSamplingBilinear, parent =
+ torch.class('nn.SpatialUpSamplingBilinear', 'nn.Module')
+
+--[[
+Applies a 2D bilinear up-sampling over an input image composed of several
+input planes.
+
+The Y and X dimensions are assumed to be the last 2 tensor dimensions. For
+instance, if the tensor is 4D, then dim 3 is the y dimension and dim 4 is the x.
+
+scale_factor is assumed to be a positive integer.
+owidth  = width*scale_factor
+oheight = height*scale_factor
+
+Alternatively, owidth and oheight can be directly provided as input.
+--]]
+
+function SpatialUpSamplingBilinear:__init(params)
+ parent.__init(self)
+
+ self.owidth, self.oheight, self.scale_factor = nil, nil, nil
+ if torch.type(params) == 'table' then
+ self.owidth, self.oheight = params.owidth, params.oheight
+ else
+ self.scale_factor = params
+ if self.scale_factor < 1 then
+         error('scale_factor must be at least 1')
+ end
+ if math.floor(self.scale_factor) ~= self.scale_factor then
+ error('scale_factor must be integer')
+ end
+ end
+ self.inputSize = torch.LongStorage(4)
+ self.outputSize = torch.LongStorage(4)
+end
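+
+-- Construction sketches (values are illustrative):
+--   nn.SpatialUpSamplingBilinear(2)                      -- scale by 2
+--   nn.SpatialUpSamplingBilinear{owidth=64, oheight=48}  -- explicit size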
+
+local function makeContiguous(self, input, gradOutput)
+ if not input:isContiguous() then
+ self._input = self._input or input.new()
+ self._input:resizeAs(input):copy(input)
+ input = self._input
+ end
+ if gradOutput then
+ if not gradOutput:isContiguous() then
+ self._gradOutput = self._gradOutput or gradOutput.new()
+ self._gradOutput:resizeAs(gradOutput):copy(gradOutput)
+ gradOutput = self._gradOutput
+ end
+ end
+ return input, gradOutput
+end
+
+function SpatialUpSamplingBilinear:setSize(input)
+ local xdim = input:dim()
+ local ydim = xdim - 1
+ for i = 1, input:dim() do
+ self.inputSize[i] = input:size(i)
+ self.outputSize[i] = input:size(i)
+ end
+ if self.scale_factor ~= nil then
+ self.outputSize[ydim] = self.outputSize[ydim] * self.scale_factor
+ self.outputSize[xdim] = self.outputSize[xdim] * self.scale_factor
+ else
+ self.outputSize[ydim] = self.oheight
+ self.outputSize[xdim] = self.owidth
+ end
+end
+
+function SpatialUpSamplingBilinear:updateOutput(input)
+ assert(input:dim() == 4 or input:dim()==3,
+ 'SpatialUpSamplingBilinear only supports 3D or 4D tensors' )
+ input = makeContiguous(self, input)
+ local inputwas3D = false
+ if input:dim() == 3 then
+ input=input:view(-1, input:size(1), input:size(2), input:size(3))
+ inputwas3D = true
+ end
+ local xdim = input:dim()
+ local ydim = xdim - 1
+ self:setSize(input)
+ input.THNN.SpatialUpSamplingBilinear_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ self.outputSize[ydim],
+ self.outputSize[xdim]
+ )
+ if inputwas3D then
+ input = input:squeeze(1)
+ self.output = self.output:squeeze(1)
+ end
+ return self.output
+end
+
+function SpatialUpSamplingBilinear:updateGradInput(input, gradOutput)
+ assert(input:dim() == 4 or input:dim()==3,
+      'SpatialUpSamplingBilinear only supports 3D or 4D tensors' )
+ assert(input:dim() == gradOutput:dim(),
+      'Input and gradOutput should have the same number of dimensions' )
+ input, gradOutput = makeContiguous(self, input, gradOutput)
+ local inputwas3D = false
+ if input:dim() == 3 then
+ input = input:view(-1, input:size(1), input:size(2), input:size(3))
+ gradOutput = gradOutput:view(-1, gradOutput:size(1), gradOutput:size(2),
+ gradOutput:size(3))
+ inputwas3D = true
+ end
+ local xdim = input:dim()
+ local ydim = xdim - 1
+ self.gradInput:resizeAs(input)
+ input.THNN.SpatialUpSamplingBilinear_updateGradInput(
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ input:size(1),
+ input:size(2),
+ input:size(3),
+ input:size(4),
+ self.outputSize[ydim],
+ self.outputSize[xdim]
+ )
+ if inputwas3D then
+ input = input:squeeze(1)
+ gradOutput = gradOutput:squeeze(1)
+ self.gradInput = self.gradInput:squeeze(1)
+ end
+ return self.gradInput
+end
+
+
+function SpatialUpSamplingBilinear:__tostring__()
+ local s
+ if self.scale_factor ~= nil then
+ s = string.format('%s(%d)', torch.type(self), self.scale_factor)
+ else
+ s = string.format('%s(%d, %d)',
+ torch.type(self), self.oheight, self.owidth)
+ end
+ return s
+end
diff --git a/contrib/lua-torch/nn/SpatialUpSamplingNearest.lua b/contrib/lua-torch/nn/SpatialUpSamplingNearest.lua
new file mode 100644
index 000000000..362ae73a3
--- /dev/null
+++ b/contrib/lua-torch/nn/SpatialUpSamplingNearest.lua
@@ -0,0 +1,59 @@
+local SpatialUpSamplingNearest, parent = torch.class('nn.SpatialUpSamplingNearest', 'nn.Module')
+
+--[[
+Applies a 2D up-sampling over an input image composed of several input planes.
+
+The upsampling is done using the simple nearest neighbor technique.
+
+The Y and X dimensions are assumed to be the last 2 tensor dimensions. For
+instance, if the tensor is 4D, then dim 3 is the y dimension and dim 4 is the x.
+
+owidth = width*scale_factor
+oheight = height*scale_factor
+--]]
+
+function SpatialUpSamplingNearest:__init(scale)
+ parent.__init(self)
+
+ self.scale_factor = scale
+ if self.scale_factor < 1 then
+      error('scale_factor must be at least 1')
+ end
+ if math.floor(self.scale_factor) ~= self.scale_factor then
+ error('scale_factor must be integer')
+ end
+ self.inputSize = torch.LongStorage(4)
+ self.outputSize = torch.LongStorage(4)
+end
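+
+-- e.g. (illustrative sizes) nn.SpatialUpSamplingNearest(2) maps a 3x8x8
+-- input to a 3x16x16 output.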
+
+function SpatialUpSamplingNearest:updateOutput(input)
+ if input:dim() ~= 4 and input:dim() ~= 3 then
+      error('SpatialUpSamplingNearest only supports 3D or 4D tensors')
+ end
+ -- Copy the input size
+ local xdim = input:dim()
+ local ydim = input:dim() - 1
+ for i = 1, input:dim() do
+ self.inputSize[i] = input:size(i)
+ self.outputSize[i] = input:size(i)
+ end
+ self.outputSize[ydim] = self.outputSize[ydim] * self.scale_factor
+ self.outputSize[xdim] = self.outputSize[xdim] * self.scale_factor
+ input.THNN.SpatialUpSamplingNearest_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ self.scale_factor
+ )
+ return self.output
+end
+
+function SpatialUpSamplingNearest:updateGradInput(input, gradOutput)
+ self.gradInput:resizeAs(input)
+ input.THNN.SpatialUpSamplingNearest_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.scale_factor
+ )
+ return self.gradInput
+end
diff --git a/contrib/lua-torch/nn/SpatialZeroPadding.lua b/contrib/lua-torch/nn/SpatialZeroPadding.lua
new file mode 100644
index 000000000..f19925841
--- /dev/null
+++ b/contrib/lua-torch/nn/SpatialZeroPadding.lua
@@ -0,0 +1,104 @@
+local SpatialZeroPadding, parent = torch.class('nn.SpatialZeroPadding', 'nn.Module')
+
+function SpatialZeroPadding:__init(pad_l, pad_r, pad_t, pad_b)
+ parent.__init(self)
+ self.pad_l = pad_l
+ self.pad_r = pad_r or self.pad_l
+ self.pad_t = pad_t or self.pad_l
+ self.pad_b = pad_b or self.pad_l
+end
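+
+-- Negative pad values crop the corresponding border instead of padding it.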
+
+function SpatialZeroPadding:updateOutput(input)
+ if input:dim() == 3 then
+ -- sizes
+ local h = input:size(2) + self.pad_t + self.pad_b
+ local w = input:size(3) + self.pad_l + self.pad_r
+ if w < 1 or h < 1 then error('input is too small') end
+ self.output:resize(input:size(1), h, w)
+ self.output:zero()
+ -- crop input if necessary
+ local c_input = input
+ if self.pad_t < 0 then c_input = c_input:narrow(2, 1 - self.pad_t, c_input:size(2) + self.pad_t) end
+ if self.pad_b < 0 then c_input = c_input:narrow(2, 1, c_input:size(2) + self.pad_b) end
+ if self.pad_l < 0 then c_input = c_input:narrow(3, 1 - self.pad_l, c_input:size(3) + self.pad_l) end
+ if self.pad_r < 0 then c_input = c_input:narrow(3, 1, c_input:size(3) + self.pad_r) end
+      -- crop output if necessary
+ local c_output = self.output
+ if self.pad_t > 0 then c_output = c_output:narrow(2, 1 + self.pad_t, c_output:size(2) - self.pad_t) end
+ if self.pad_b > 0 then c_output = c_output:narrow(2, 1, c_output:size(2) - self.pad_b) end
+ if self.pad_l > 0 then c_output = c_output:narrow(3, 1 + self.pad_l, c_output:size(3) - self.pad_l) end
+ if self.pad_r > 0 then c_output = c_output:narrow(3, 1, c_output:size(3) - self.pad_r) end
+ -- copy input to output
+ c_output:copy(c_input)
+ elseif input:dim() == 4 then
+ -- sizes
+ local h = input:size(3) + self.pad_t + self.pad_b
+ local w = input:size(4) + self.pad_l + self.pad_r
+ if w < 1 or h < 1 then error('input is too small') end
+ self.output:resize(input:size(1), input:size(2), h, w)
+ self.output:zero()
+ -- crop input if necessary
+ local c_input = input
+ if self.pad_t < 0 then c_input = c_input:narrow(3, 1 - self.pad_t, c_input:size(3) + self.pad_t) end
+ if self.pad_b < 0 then c_input = c_input:narrow(3, 1, c_input:size(3) + self.pad_b) end
+ if self.pad_l < 0 then c_input = c_input:narrow(4, 1 - self.pad_l, c_input:size(4) + self.pad_l) end
+ if self.pad_r < 0 then c_input = c_input:narrow(4, 1, c_input:size(4) + self.pad_r) end
+      -- crop output if necessary
+ local c_output = self.output
+ if self.pad_t > 0 then c_output = c_output:narrow(3, 1 + self.pad_t, c_output:size(3) - self.pad_t) end
+ if self.pad_b > 0 then c_output = c_output:narrow(3, 1, c_output:size(3) - self.pad_b) end
+ if self.pad_l > 0 then c_output = c_output:narrow(4, 1 + self.pad_l, c_output:size(4) - self.pad_l) end
+ if self.pad_r > 0 then c_output = c_output:narrow(4, 1, c_output:size(4) - self.pad_r) end
+ -- copy input to output
+ c_output:copy(c_input)
+ else
+ error('input must be 3 or 4-dimensional')
+ end
+ return self.output
+end
+
+function SpatialZeroPadding:updateGradInput(input, gradOutput)
+ if input:dim() == 3 then
+ self.gradInput:resizeAs(input):zero()
+ -- crop gradInput if necessary
+ local cg_input = self.gradInput
+ if self.pad_t < 0 then cg_input = cg_input:narrow(2, 1 - self.pad_t, cg_input:size(2) + self.pad_t) end
+ if self.pad_b < 0 then cg_input = cg_input:narrow(2, 1, cg_input:size(2) + self.pad_b) end
+ if self.pad_l < 0 then cg_input = cg_input:narrow(3, 1 - self.pad_l, cg_input:size(3) + self.pad_l) end
+ if self.pad_r < 0 then cg_input = cg_input:narrow(3, 1, cg_input:size(3) + self.pad_r) end
+      -- crop gradOutput if necessary
+ local cg_output = gradOutput
+ if self.pad_t > 0 then cg_output = cg_output:narrow(2, 1 + self.pad_t, cg_output:size(2) - self.pad_t) end
+ if self.pad_b > 0 then cg_output = cg_output:narrow(2, 1, cg_output:size(2) - self.pad_b) end
+ if self.pad_l > 0 then cg_output = cg_output:narrow(3, 1 + self.pad_l, cg_output:size(3) - self.pad_l) end
+ if self.pad_r > 0 then cg_output = cg_output:narrow(3, 1, cg_output:size(3) - self.pad_r) end
+      -- copy gradOutput to gradInput
+ cg_input:copy(cg_output)
+ elseif input:dim() == 4 then
+ self.gradInput:resizeAs(input):zero()
+ -- crop gradInput if necessary
+ local cg_input = self.gradInput
+ if self.pad_t < 0 then cg_input = cg_input:narrow(3, 1 - self.pad_t, cg_input:size(3) + self.pad_t) end
+ if self.pad_b < 0 then cg_input = cg_input:narrow(3, 1, cg_input:size(3) + self.pad_b) end
+ if self.pad_l < 0 then cg_input = cg_input:narrow(4, 1 - self.pad_l, cg_input:size(4) + self.pad_l) end
+ if self.pad_r < 0 then cg_input = cg_input:narrow(4, 1, cg_input:size(4) + self.pad_r) end
+      -- crop gradOutput if necessary
+ local cg_output = gradOutput
+ if self.pad_t > 0 then cg_output = cg_output:narrow(3, 1 + self.pad_t, cg_output:size(3) - self.pad_t) end
+ if self.pad_b > 0 then cg_output = cg_output:narrow(3, 1, cg_output:size(3) - self.pad_b) end
+ if self.pad_l > 0 then cg_output = cg_output:narrow(4, 1 + self.pad_l, cg_output:size(4) - self.pad_l) end
+ if self.pad_r > 0 then cg_output = cg_output:narrow(4, 1, cg_output:size(4) - self.pad_r) end
+      -- copy gradOutput to gradInput
+ cg_input:copy(cg_output)
+ else
+ error('input must be 3 or 4-dimensional')
+ end
+ return self.gradInput
+end
+
+
+function SpatialZeroPadding:__tostring__()
+ return torch.type(self) ..
+ string.format('(l=%d, r=%d, t=%d, b=%d)', self.pad_l, self.pad_r,
+ self.pad_t, self.pad_b)
+end
diff --git a/contrib/lua-torch/nn/SplitTable.lua b/contrib/lua-torch/nn/SplitTable.lua
new file mode 100644
index 000000000..7c4f968e6
--- /dev/null
+++ b/contrib/lua-torch/nn/SplitTable.lua
@@ -0,0 +1,43 @@
+local SplitTable, parent = torch.class('nn.SplitTable', 'nn.Module')
+
+function SplitTable:__init(dimension, nInputDims)
+ parent.__init(self)
+ self.dimension = dimension
+ self.nInputDims = nInputDims
+end
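+
+-- A minimal usage sketch (sizes are illustrative):
+-- local t = nn.SplitTable(1):forward(torch.rand(3, 4))
+-- -- t is a Lua table holding three tensors of size 4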
+
+function SplitTable:_getPositiveDimension(input)
+ local dimension = self.dimension
+ if dimension < 0 then
+ dimension = input:dim() + dimension + 1
+ elseif self.nInputDims and input:dim()==(self.nInputDims+1) then
+ dimension = dimension + 1
+ end
+ return dimension
+end
+
+function SplitTable:updateOutput(input)
+ local dimension = self:_getPositiveDimension(input)
+ local slices = input:size(dimension)
+
+ local currentOutput= {}
+ for i=1,slices do
+ currentOutput[#currentOutput+1] = input:select(dimension,i)
+ end
+ self.output = currentOutput
+ return self.output
+end
+
+function SplitTable:updateGradInput(input, gradOutput)
+ local dimension = self:_getPositiveDimension(input)
+ local slices = input:size(dimension)
+ if self.gradInput then
+ self.gradInput:resizeAs(input)
+
+ for i=1,slices do
+ local currentGradInput = gradOutput[i];
+ self.gradInput:select(dimension,i):copy(currentGradInput)
+ end
+ end
+ return self.gradInput
+end
diff --git a/contrib/lua-torch/nn/Sqrt.lua b/contrib/lua-torch/nn/Sqrt.lua
new file mode 100644
index 000000000..df354a175
--- /dev/null
+++ b/contrib/lua-torch/nn/Sqrt.lua
@@ -0,0 +1,26 @@
+local Sqrt, parent = torch.class('nn.Sqrt','nn.Module')
+
+function Sqrt:__init(eps)
+   parent.__init(self)
+   self.eps = eps or 0
+end
+
+function Sqrt:updateOutput(input)
+ self.eps = self.eps or 0
+ input.THNN.Sqrt_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ self.eps
+ )
+ return self.output
+end
+
+function Sqrt:updateGradInput(input, gradOutput)
+ input.THNN.Sqrt_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.output:cdata()
+ )
+ return self.gradInput
+end
diff --git a/contrib/lua-torch/nn/Square.lua b/contrib/lua-torch/nn/Square.lua
new file mode 100644
index 000000000..a6292afb9
--- /dev/null
+++ b/contrib/lua-torch/nn/Square.lua
@@ -0,0 +1,22 @@
+local Square, parent = torch.class('nn.Square', 'nn.Module')
+
+function Square:__init(args)
+ parent.__init(self)
+end
+
+function Square:updateOutput(input)
+ input.THNN.Square_updateOutput(
+ input:cdata(),
+ self.output:cdata()
+ )
+ return self.output
+end
+
+function Square:updateGradInput(input, gradOutput)
+ input.THNN.Square_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata()
+ )
+ return self.gradInput
+end
diff --git a/contrib/lua-torch/nn/Squeeze.lua b/contrib/lua-torch/nn/Squeeze.lua
new file mode 100644
index 000000000..7d204a19d
--- /dev/null
+++ b/contrib/lua-torch/nn/Squeeze.lua
@@ -0,0 +1,40 @@
+local Squeeze, parent = torch.class('nn.Squeeze', 'nn.Module')
+
+function Squeeze:__init(dim, numInputDims)
+ parent.__init(self)
+ self.dim = dim
+ self:setNumInputDims(numInputDims)
+end
+
+function Squeeze:setNumInputDims(numInputDims)
+ self.numInputDims = numInputDims
+ return self
+end
+
+function Squeeze:updateOutput(input)
+ assert(input and torch.isTensor(input), 'Squeeze only works on tensors')
+ local dim = self.dim
+ local addone = false
+ if self.numInputDims and input:dim()==(self.numInputDims+1) then
+ if dim then
+ dim = dim + 1
+ elseif input:size(1) == 1 then
+ addone = true -- in case of minibatch of size 1.
+ end
+ end
+ self.output:set(dim and input:squeeze(dim) or input:squeeze())
+ if addone then
+      local s = self.output:size():totable()
+ table.insert(s, 1, 1)
+ self.output:set(self.output:view(torch.LongStorage(s)))
+ end
+ return self.output
+end
+
+function Squeeze:updateGradInput(input, gradOutput)
+ assert(input and torch.isTensor(input), 'Squeeze only works on tensors')
+ assert(gradOutput and torch.isTensor(gradOutput), 'Squeeze only works on tensors')
+ assert(input:nElement() == gradOutput:nElement())
+ self.gradInput:set(gradOutput:view(input:size()))
+ return self.gradInput
+end
diff --git a/contrib/lua-torch/nn/StochasticGradient.lua b/contrib/lua-torch/nn/StochasticGradient.lua
new file mode 100644
index 000000000..a060371e8
--- /dev/null
+++ b/contrib/lua-torch/nn/StochasticGradient.lua
@@ -0,0 +1,62 @@
+local StochasticGradient = torch.class('nn.StochasticGradient')
+
+function StochasticGradient:__init(module, criterion)
+ self.learningRate = 0.01
+ self.learningRateDecay = 0
+ self.maxIteration = 25
+ self.shuffleIndices = true
+ self.module = module
+ self.criterion = criterion
+ self.verbose = true
+end
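+
+-- A minimal training sketch (module and criterion are illustrative);
+-- dataset must implement dataset:size() and dataset[i] = {input, target}:
+-- local trainer = nn.StochasticGradient(mlp, nn.MSECriterion())
+-- trainer.learningRate = 0.01
+-- trainer.maxIteration = 10
+-- trainer:train(dataset)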
+
+function StochasticGradient:train(dataset)
+ local iteration = 1
+ local currentLearningRate = self.learningRate
+ local module = self.module
+ local criterion = self.criterion
+
+ local shuffledIndices = torch.randperm(dataset:size(), 'torch.LongTensor')
+ if not self.shuffleIndices then
+ for t = 1,dataset:size() do
+ shuffledIndices[t] = t
+ end
+ end
+
+ print("# StochasticGradient: training")
+
+ while true do
+ local currentError = 0
+ for t = 1,dataset:size() do
+ local example = dataset[shuffledIndices[t]]
+ local input = example[1]
+ local target = example[2]
+
+ currentError = currentError + criterion:forward(module:forward(input), target)
+
+ module:updateGradInput(input, criterion:updateGradInput(module.output, target))
+ module:accUpdateGradParameters(input, criterion.gradInput, currentLearningRate)
+
+ if self.hookExample then
+ self.hookExample(self, example)
+ end
+ end
+
+ currentError = currentError / dataset:size()
+
+ if self.hookIteration then
+ self.hookIteration(self, iteration, currentError)
+ end
+
+ if self.verbose then
+ print("# current error = " .. currentError)
+ end
+ iteration = iteration + 1
+ currentLearningRate = self.learningRate/(1+iteration*self.learningRateDecay)
+ if self.maxIteration > 0 and iteration > self.maxIteration then
+ print("# StochasticGradient: you have reached the maximum number of iterations")
+ print("# training error = " .. currentError)
+ break
+ end
+ end
+end
diff --git a/contrib/lua-torch/nn/Sum.lua b/contrib/lua-torch/nn/Sum.lua
new file mode 100644
index 000000000..7fe8a1ab8
--- /dev/null
+++ b/contrib/lua-torch/nn/Sum.lua
@@ -0,0 +1,67 @@
+local Sum, parent = torch.class('nn.Sum', 'nn.Module')
+
+function Sum:__init(dimension, nInputDims, sizeAverage, squeeze)
+ parent.__init(self)
+ self.dimension = dimension or 1
+ -- do not assign default value to nInputDims or it will break backward compatibility
+ self.nInputDims = nInputDims
+ self.sizeAverage = sizeAverage or false
+ if squeeze ~= nil then
+ assert(type(squeeze) == 'boolean', 'squeeze has to be true/false')
+ self.squeeze = squeeze
+ else
+ self.squeeze = true
+ end
+end
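+
+-- e.g. nn.Sum(1):forward(torch.rand(3, 4)) returns a size-4 tensor; the
+-- summed dimension is squeezed away by default.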
+
+function Sum:_getPositiveDimension(input)
+ local dimension = self.dimension
+ if dimension < 0 then
+ dimension = input:dim() + dimension + 1
+ elseif self.nInputDims and input:dim()==(self.nInputDims+1) then
+ dimension = dimension + 1
+ end
+ assert(input:dim() >= dimension, "dimension exceeds input dimensions")
+ return dimension
+end
+
+function Sum:updateOutput(input)
+ local dimension = self:_getPositiveDimension(input)
+ if type(self.output) == 'number' then
+ self.output = input.new()
+ end
+ self.output:sum(input, dimension)
+ if self.sizeAverage then
+ self.output:div(input:size(dimension))
+ end
+ if (self.squeeze == nil or self.squeeze) and self.output:nDimension() > 1 then
+ self.output:set(self.output:select(dimension, 1))
+ end
+ return self.output
+end
+
+function Sum:updateGradInput(input, gradOutput)
+ local dimension = self:_getPositiveDimension(input)
+ -- zero-strides don't work with MKL/BLAS, so
+ -- don't set self.gradInput to zero-stride tensor.
+ -- Instead, do a deepcopy
+ local size = input:size()
+ size[dimension] = 1
+ if not gradOutput:isContiguous() then
+ self._gradOutput = self._gradOutput or gradOutput.new()
+ self._gradOutput:resizeAs(gradOutput):copy(gradOutput)
+ gradOutput = self._gradOutput
+ end
+ gradOutput = gradOutput:view(size)
+ self.gradInput:resizeAs(input)
+ self.gradInput:copy(gradOutput:expandAs(input))
+ if self.sizeAverage then
+ self.gradInput:div(input:size(dimension))
+ end
+ return self.gradInput
+end
+
+function Sum:clearState()
+ nn.utils.clear(self, '_gradOutput')
+ return parent.clearState(self)
+end
diff --git a/contrib/lua-torch/nn/THNN.lua b/contrib/lua-torch/nn/THNN.lua
new file mode 100644
index 000000000..0848e9ed2
--- /dev/null
+++ b/contrib/lua-torch/nn/THNN.lua
@@ -0,0 +1,140 @@
+local ffi = require 'ffi'
+
+local THNN = {}
+
+
+local generic_THNN_h = require 'nn.THNN_h'
+-- strip all lines starting with #
+-- to remove preprocessor directives originally present
+-- in THNN.h
+generic_THNN_h = generic_THNN_h:gsub("\n#[^\n]*", "")
+generic_THNN_h = generic_THNN_h:gsub("^#[^\n]*\n", "")
+
+-- THGenerator struct declaration copied from torch7/lib/TH/THRandom.h
+local base_declarations = [[
+typedef void THNNState;
+
+typedef struct {
+ unsigned long the_initial_seed;
+ int left;
+ int seeded;
+ unsigned long next;
+ unsigned long state[624]; /* the array for the state vector 624 = _MERSENNE_STATE_N */
+ double normal_x;
+ double normal_y;
+ double normal_rho;
+ int normal_is_valid;
+} THGenerator;
+]]
+
+-- polyfill for Lua 5.1
+if not package.searchpath then
+ local sep = package.config:sub(1,1)
+ function package.searchpath(mod, path)
+ mod = mod:gsub('%.', sep)
+ for m in path:gmatch('[^;]+') do
+ local nm = m:gsub('?', mod)
+ local f = io.open(nm, 'r')
+ if f then
+ f:close()
+ return nm
+ end
+ end
+ end
+end
+
+-- load libTHNN
+THNN.C = ffi.load(package.searchpath('libTHNN', package.cpath))
+
+ffi.cdef(base_declarations)
+
+-- expand macros so the original lines from lib/THNN/generic/THNN.h can be used
+local preprocessed = string.gsub(generic_THNN_h, 'TH_API void THNN_%(([%a%d_]+)%)', 'void THNN_TYPE%1')
+
+local replacements =
+{
+ {
+ ['TYPE'] = 'Double',
+ ['accreal'] = 'double',
+ ['THTensor'] = 'THDoubleTensor',
+ ['THIndexTensor'] = 'THLongTensor',
+ ['THIntegerTensor'] = 'THIntTensor',
+ ['THIndex_t'] = 'long',
+ ['THInteger_t'] = 'int'
+ },
+ {
+ ['TYPE'] = 'Float',
+ ['accreal'] = 'double',
+ ['THTensor'] = 'THFloatTensor',
+ ['THIndexTensor'] = 'THLongTensor',
+ ['THIntegerTensor'] = 'THIntTensor',
+ ['THIndex_t'] = 'long',
+ ['THInteger_t'] = 'int'
+ }
+}
+
+for i=1,#replacements do
+ local r = replacements[i]
+ local s = preprocessed
+ for k,v in pairs(r) do
+ s = string.gsub(s, k, v)
+ end
+ ffi.cdef(s)
+end
+
+THNN.NULL = ffi.NULL or nil
+
+function THNN.getState()
+ return ffi.NULL or nil
+end
+
+function THNN.optionalTensor(t)
+ return t and t:cdata() or THNN.NULL
+end
+
+local function extract_function_names(s)
+ local t = {}
+ for n in string.gmatch(s, 'TH_API void THNN_%(([%a%d_]+)%)') do
+ t[#t+1] = n
+ end
+ return t
+end
+
+function THNN.bind(lib, base_names, type_name, state_getter)
+ local ftable = {}
+ local prefix = 'THNN_' .. type_name
+ for i,n in ipairs(base_names) do
+ -- use pcall since some libs might not support all functions (e.g. cunn)
+ local ok,v = pcall(function() return lib[prefix .. n] end)
+ if ok then
+         ftable[n] = function(...) v(state_getter(), ...) end -- implicitly add state
+      else
+         print('not found: ' .. prefix .. n .. ': ' .. v)
+ end
+ end
+ return ftable
+end
+
+-- build function table
+local function_names = extract_function_names(generic_THNN_h)
+
+THNN.kernels = {}
+THNN.kernels['torch.FloatTensor'] = THNN.bind(THNN.C, function_names, 'Float', THNN.getState)
+THNN.kernels['torch.DoubleTensor'] = THNN.bind(THNN.C, function_names, 'Double', THNN.getState)
+
+torch.getmetatable('torch.FloatTensor').THNN = THNN.kernels['torch.FloatTensor']
+torch.getmetatable('torch.DoubleTensor').THNN = THNN.kernels['torch.DoubleTensor']
+
+function THNN.runKernel(f, type, ...)
+ local ftable = THNN.kernels[type]
+ if not ftable then
+ error('Unsupported tensor type: '..type)
+ end
+   local kernel = ftable[f]
+   if not kernel then
+      error(string.format("Function '%s' not found for tensor type '%s'.", f, type))
+   end
+   kernel(...)
+end
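+
+-- Dispatch sketch (assuming the Abs kernels are present in libTHNN):
+-- THNN.runKernel('Abs_updateOutput', 'torch.FloatTensor',
+--                input:cdata(), output:cdata())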
+
+return THNN
diff --git a/contrib/lua-torch/nn/Tanh.lua b/contrib/lua-torch/nn/Tanh.lua
new file mode 100644
index 000000000..fc42cbbfd
--- /dev/null
+++ b/contrib/lua-torch/nn/Tanh.lua
@@ -0,0 +1,19 @@
+local Tanh = torch.class('nn.Tanh', 'nn.Module')
+
+function Tanh:updateOutput(input)
+ input.THNN.Tanh_updateOutput(
+ input:cdata(),
+ self.output:cdata()
+ )
+ return self.output
+end
+
+function Tanh:updateGradInput(input, gradOutput)
+ input.THNN.Tanh_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.output:cdata()
+ )
+ return self.gradInput
+end
diff --git a/contrib/lua-torch/nn/TanhShrink.lua b/contrib/lua-torch/nn/TanhShrink.lua
new file mode 100644
index 000000000..96df6c5b7
--- /dev/null
+++ b/contrib/lua-torch/nn/TanhShrink.lua
@@ -0,0 +1,20 @@
+local TanhShrink, parent = torch.class('nn.TanhShrink','nn.Module')
+
+function TanhShrink:__init()
+ parent.__init(self)
+ self.tanh = nn.Tanh()
+end
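+
+-- TanhShrink computes output = input - tanh(input).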
+
+function TanhShrink:updateOutput(input)
+ local th = self.tanh:updateOutput(input)
+ self.output:resizeAs(input):copy(input)
+ self.output:add(-1,th)
+ return self.output
+end
+
+function TanhShrink:updateGradInput(input, gradOutput)
+ local dth = self.tanh:updateGradInput(input,gradOutput)
+ self.gradInput:resizeAs(input):copy(gradOutput)
+ self.gradInput:add(-1,dth)
+ return self.gradInput
+end
diff --git a/contrib/lua-torch/nn/TemporalConvolution.lua b/contrib/lua-torch/nn/TemporalConvolution.lua
new file mode 100644
index 000000000..4b3a89eb6
--- /dev/null
+++ b/contrib/lua-torch/nn/TemporalConvolution.lua
@@ -0,0 +1,73 @@
+local TemporalConvolution, parent = torch.class('nn.TemporalConvolution', 'nn.Module')
+
+function TemporalConvolution:__init(inputFrameSize, outputFrameSize, kW, dW)
+ parent.__init(self)
+
+ dW = dW or 1
+
+ self.inputFrameSize = inputFrameSize
+ self.outputFrameSize = outputFrameSize
+ self.kW = kW
+ self.dW = dW
+
+ self.weight = torch.Tensor(outputFrameSize, inputFrameSize*kW)
+ self.bias = torch.Tensor(outputFrameSize)
+ self.gradWeight = torch.Tensor(outputFrameSize, inputFrameSize*kW)
+ self.gradBias = torch.Tensor(outputFrameSize)
+
+ self:reset()
+end
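+
+-- For an input with nInputFrame frames the output has
+--   nOutputFrame = (nInputFrame - kW) / dW + 1
+-- frames, each of size outputFrameSize.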
+
+function TemporalConvolution:reset(stdv)
+ if stdv then
+ stdv = stdv * math.sqrt(3)
+ else
+ stdv = 1/math.sqrt(self.kW*self.inputFrameSize)
+ end
+ if nn.oldSeed then
+ self.weight:apply(function()
+ return torch.uniform(-stdv, stdv)
+ end)
+ self.bias:apply(function()
+ return torch.uniform(-stdv, stdv)
+ end)
+ else
+ self.weight:uniform(-stdv, stdv)
+ self.bias:uniform(-stdv, stdv)
+ end
+end
+
+function TemporalConvolution:updateOutput(input)
+ input.THNN.TemporalConvolution_updateOutput(
+ input:cdata(), self.output:cdata(),
+ self.weight:cdata(), self.bias:cdata(),
+ self.kW, self.dW,
+ self.inputFrameSize, self.outputFrameSize
+ )
+ return self.output
+end
+
+function TemporalConvolution:updateGradInput(input, gradOutput)
+ if self.gradInput then
+ input.THNN.TemporalConvolution_updateGradInput(
+ input:cdata(), gradOutput:cdata(),
+ self.gradInput:cdata(), self.weight:cdata(),
+ self.kW, self.dW
+ )
+ return self.gradInput
+ end
+end
+
+function TemporalConvolution:accGradParameters(input, gradOutput, scale)
+ scale = scale or 1
+ input.THNN.TemporalConvolution_accGradParameters(
+ input:cdata(), gradOutput:cdata(),
+ self.gradWeight:cdata(), self.gradBias:cdata(),
+ self.kW, self.dW, scale
+ )
+end
+
+function TemporalConvolution:sharedAccUpdateGradParameters(input, gradOutput, lr)
+ -- we do not need to accumulate parameters when sharing:
+ self:defaultAccUpdateGradParameters(input, gradOutput, lr)
+end
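+
+-- Usage sketch (illustrative sizes): an nInputFrame x inputFrameSize sequence
+-- maps to (nInputFrame - kW)/dW + 1 frames of size outputFrameSize.
+--   local conv = nn.TemporalConvolution(16, 32, 3)   -- kW = 3, dW = 1
+--   local out = conv:forward(torch.randn(100, 16))   -- out is 98 x 32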
diff --git a/contrib/lua-torch/nn/TemporalDynamicKMaxPooling.lua b/contrib/lua-torch/nn/TemporalDynamicKMaxPooling.lua
new file mode 100644
index 000000000..644a0fa9c
--- /dev/null
+++ b/contrib/lua-torch/nn/TemporalDynamicKMaxPooling.lua
@@ -0,0 +1,65 @@
+--[[
+ This file implements Dynamic K Max Pooling as described in the paper:
+ "A Convolutional Neural Network for Modelling Sentences"
+ by Nal Kalchbrenner, Edward Grefenstette, Phil Blunsom
+
+ The operation is simply selecting the k highest values out of a sequence.
+ k can be a calculated value or pre-defined
+
+ The value of k can be calculated as in the paper by using:
+ k_top as minK
+ (L-l)/L as factor
+
+ Where:
+ k_top is the desired sequence length at the end of the convolutional part,
+ L is the total number of layers,
+ l is this layer's number
+]]
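+
+-- For example (illustrative numbers): with minK = 4 and factor = 0.5, a
+-- 30-frame input keeps k = max(4, ceil(0.5 * 30)) = 15 frames, while a
+-- 6-frame input keeps k = max(4, ceil(0.5 * 6)) = 4 frames.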
+
+local TemporalDynamicKMaxPooling, parent = torch.class('nn.TemporalDynamicKMaxPooling', 'nn.Module')
+
+function TemporalDynamicKMaxPooling:__init(minK, factor)
+ parent.__init(self)
+
+ self.minK = minK
+ self.factor = factor or 0
+end
+
+function TemporalDynamicKMaxPooling:updateOutput(input)
+ assert(input:dim() == 2 or input:dim() == 3, 'Only 2D or 3D (batch mode) input accepted')
+
+ local seqDim = input:dim()-1
+ local k = math.max(self.minK, math.ceil(self.factor*input:size(seqDim)))
+ assert(input:size(seqDim) >= self.minK, 'Input sequence length (' .. input:size(seqDim) .. ') too small for desired k value (' .. k .. ')')
+
+ -- Sort input in descending order
+ local sorted, allIndices = input:sort(seqDim,true)
+ -- Reduce the indices to only include the top-k and return to original order by sorting
+ self.indices = allIndices:narrow(seqDim, 1, k):sort(seqDim)
+
+ self.output = input:gather(seqDim, self.indices)
+
+ return self.output
+end
+
+function TemporalDynamicKMaxPooling:updateGradInput(input, gradOutput)
+ if self.gradInput then
+ local seqDim = input:dim()-1
+
+ self.gradInput:resizeAs(input)
+ self.gradInput:zero()
+
+ -- Using the previously stored indices, add the gradOutputs to their respective
+ -- input indices in the self.gradInput buffer
+ local updateValues = self.gradInput:gather(seqDim, self.indices)
+ updateValues:add(gradOutput)
+ self.gradInput:scatter(seqDim, self.indices, updateValues)
+
+ return self.gradInput
+ end
+end
+
+function TemporalDynamicKMaxPooling:clearState()
+ nn.utils.clear(self, 'indices')
+ return parent.clearState(self)
+end
diff --git a/contrib/lua-torch/nn/TemporalMaxPooling.lua b/contrib/lua-torch/nn/TemporalMaxPooling.lua
new file mode 100644
index 000000000..894f4a99f
--- /dev/null
+++ b/contrib/lua-torch/nn/TemporalMaxPooling.lua
@@ -0,0 +1,44 @@
+local TemporalMaxPooling, parent = torch.class('nn.TemporalMaxPooling', 'nn.Module')
+
+function TemporalMaxPooling:__init(kW, dW)
+ parent.__init(self)
+
+ dW = dW or kW
+
+ self.kW = kW
+ self.dW = dW
+end
+
+function TemporalMaxPooling:updateOutput(input)
+ self.indices = self.indices or torch.LongTensor()
+ if torch.typename(input):find('torch%.Cuda.*Tensor') then
+ self.indices = torch.CudaLongTensor and self.indices:cudaLong() or self.indices
+ else
+ self.indices = self.indices:long()
+ end
+ input.THNN.TemporalMaxPooling_updateOutput(
+ input:cdata(), self.output:cdata(),
+ self.indices:cdata(), self.kW, self.dW
+ )
+ return self.output
+end
+
+function TemporalMaxPooling:updateGradInput(input, gradOutput)
+ if self.gradInput then
+ input.THNN.TemporalMaxPooling_updateGradInput(
+ input:cdata(), gradOutput:cdata(),
+ self.gradInput:cdata(), self.indices:cdata(),
+ self.kW, self.dW
+ )
+ return self.gradInput
+ end
+end
+
+function TemporalMaxPooling:empty()
+ self:clearState()
+end
+
+function TemporalMaxPooling:clearState()
+ if self.indices then self.indices:set() end
+ return parent.clearState(self)
+end
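+
+-- Usage sketch (illustrative): max-pools kW frames at stride dW along time,
+-- yielding (nInputFrame - kW)/dW + 1 output frames.
+--   local mp = nn.TemporalMaxPooling(2, 2)
+--   local out = mp:forward(torch.randn(10, 8))   -- out is 5 x 8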
diff --git a/contrib/lua-torch/nn/TemporalRowConvolution.lua b/contrib/lua-torch/nn/TemporalRowConvolution.lua
new file mode 100644
index 000000000..7c9d6a269
--- /dev/null
+++ b/contrib/lua-torch/nn/TemporalRowConvolution.lua
@@ -0,0 +1,120 @@
+local THNN = require "nn.THNN"
+
+local TemporalRowConvolution, parent = torch.class("nn.TemporalRowConvolution", "nn.Module")
+
+function TemporalRowConvolution:__init(inputFrameSize, kW, dW, featFirst)
+ parent.__init(self)
+
+ self.inputFrameSize = inputFrameSize
+ self.kW = kW
+ self.dW = dW or 1
+
+ self.weight = torch.Tensor(inputFrameSize, 1, kW)
+ self.bias = torch.Tensor(inputFrameSize)
+ self.gradWeight = torch.Tensor(inputFrameSize, 1, kW)
+ self.gradBias = torch.Tensor(inputFrameSize)
+
+ -- Set to true for batch x inputFrameSize x nInputFrame
+ self.featFirst = featFirst and true or false
+ self:reset()
+end
+
+function TemporalRowConvolution:noBias()
+ self.bias = nil
+ self.gradBias = nil
+ return self
+end
+
+function TemporalRowConvolution:reset(stdv)
+ if stdv then
+ stdv = stdv * math.sqrt(3)
+ else
+ stdv = 1 / math.sqrt(self.kW * self.inputFrameSize)
+ end
+ self.weight:uniform(-stdv, stdv)
+ self.bias:uniform(-stdv, stdv)
+end
+
+function TemporalRowConvolution:updateOutput(input)
+ assert(input.THNN, torch.type(input)..".THNN backend not imported")
+ self.finput = self.finput or input.new()
+ self.fgradInput = self.fgradInput or input.new()
+
+ input.THNN.TemporalRowConvolution_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ self.weight:cdata(),
+ THNN.optionalTensor(self.bias),
+ self.finput:cdata(),
+ self.fgradInput:cdata(),
+ self.kW,
+ self.dW,
+ 0, -- would be self.padW
+ self.featFirst
+ )
+
+ return self.output
+end
+
+function TemporalRowConvolution:updateGradInput(input, gradOutput)
+ assert(input.THNN, torch.type(input)..".THNN backend not imported")
+
+ if self.gradInput then
+ input.THNN.TemporalRowConvolution_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.weight:cdata(),
+ self.finput:cdata(),
+ self.fgradInput:cdata(),
+ self.kW,
+ self.dW,
+ 0, -- would be self.padW
+ self.featFirst
+ )
+ return self.gradInput
+ end
+end
+
+function TemporalRowConvolution:accGradParameters(input, gradOutput, scale)
+ assert(input.THNN, torch.type(input)..".THNN backend not imported")
+
+ input.THNN.TemporalRowConvolution_accGradParameters(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradWeight:cdata(),
+ THNN.optionalTensor(self.gradBias),
+ self.finput:cdata(),
+ self.fgradInput:cdata(),
+ self.kW,
+ self.dW,
+ 0, -- would be self.padW
+ self.featFirst,
+ scale or 1)
+end
+
+function TemporalRowConvolution:type(type, tensorCache)
+ if self.finput then self.finput:set() end
+ if self.fgradInput then self.fgradInput:set() end
+ return parent.type(self, type, tensorCache)
+end
+
+function TemporalRowConvolution:__tostring__()
+ local s = string.format("%s(%d, %d", torch.type(self), self.inputFrameSize, self.kW)
+ if self.dW ~= 1 then
+ s = s .. string.format(", %d", self.dW)
+ end
+ if self.padW and self.padW ~= 0 then -- currently padding is not supported
+ s = s .. ", " .. self.padW
+ end
+ if self.bias then
+ return s .. ")"
+ else
+ return s .. ") without bias"
+ end
+end
+
+function TemporalRowConvolution:clearState()
+ nn.utils.clear(self, "finput", "fgradInput", "_input", "_gradOutput")
+ return parent.clearState(self)
+end
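+
+-- Usage sketch (illustrative; output length stated under the zero padding
+-- hard-coded above): each input feature row gets its own width-kW 1-D filter.
+--   local rc = nn.TemporalRowConvolution(8, 3)   -- inputFrameSize = 8, kW = 3
+--   local out = rc:forward(torch.randn(20, 8))   -- 18 x 8 with dW = 1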
diff --git a/contrib/lua-torch/nn/TemporalSubSampling.lua b/contrib/lua-torch/nn/TemporalSubSampling.lua
new file mode 100644
index 000000000..e9287d63d
--- /dev/null
+++ b/contrib/lua-torch/nn/TemporalSubSampling.lua
@@ -0,0 +1,64 @@
+local TemporalSubSampling, parent = torch.class('nn.TemporalSubSampling', 'nn.Module')
+
+function TemporalSubSampling:__init(inputFrameSize, kW, dW)
+ parent.__init(self)
+
+ dW = dW or 1
+
+ self.inputFrameSize = inputFrameSize
+ self.kW = kW
+ self.dW = dW
+
+ self.weight = torch.Tensor(inputFrameSize)
+ self.bias = torch.Tensor(inputFrameSize)
+ self.gradWeight = torch.Tensor(inputFrameSize)
+ self.gradBias = torch.Tensor(inputFrameSize)
+
+ self:reset()
+end
+
+function TemporalSubSampling:reset(stdv)
+ if stdv then
+ stdv = stdv * math.sqrt(3)
+ else
+ stdv = 1/math.sqrt(self.kW)
+ end
+ if nn.oldSeed then
+ self.weight:apply(function()
+ return torch.uniform(-stdv, stdv)
+ end)
+ self.bias:apply(function()
+ return torch.uniform(-stdv, stdv)
+ end)
+ else
+ self.weight:uniform(-stdv, stdv)
+ self.bias:uniform(-stdv, stdv)
+ end
+end
+
+function TemporalSubSampling:updateOutput(input)
+ input.THNN.TemporalSubSampling_updateOutput(
+ input:cdata(), self.output:cdata(),
+ self.weight:cdata(), self.bias:cdata(),
+ self.kW, self.dW, self.inputFrameSize
+ )
+ return self.output
+end
+
+function TemporalSubSampling:updateGradInput(input, gradOutput)
+ if self.gradInput then
+ input.THNN.TemporalSubSampling_updateGradInput(
+ input:cdata(), gradOutput:cdata(), self.gradInput:cdata(),
+ self.weight:cdata(), self.kW, self.dW
+ )
+ return self.gradInput
+ end
+end
+
+function TemporalSubSampling:accGradParameters(input, gradOutput, scale)
+ scale = scale or 1
+ input.THNN.TemporalSubSampling_accGradParameters(
+ input:cdata(), gradOutput:cdata(), self.gradWeight:cdata(),
+ self.gradBias:cdata(), self.kW, self.dW, scale
+ )
+end
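+
+-- Usage sketch (illustrative): each output frame is weight * (sum of kW input
+-- frames) + bias per feature, giving (nInputFrame - kW)/dW + 1 output frames.
+--   local ss = nn.TemporalSubSampling(4, 2)    -- inputFrameSize = 4, kW = 2
+--   local out = ss:forward(torch.randn(9, 4))  -- out is 8 x 4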
diff --git a/contrib/lua-torch/nn/Threshold.lua b/contrib/lua-torch/nn/Threshold.lua
new file mode 100644
index 000000000..6fdd26408
--- /dev/null
+++ b/contrib/lua-torch/nn/Threshold.lua
@@ -0,0 +1,51 @@
+local Threshold, parent = torch.class('nn.Threshold','nn.Module')
+
+function Threshold:__init(th,v,ip)
+ parent.__init(self)
+ self.threshold = th or 1e-6
+ self.val = v or 0
+ if (th and type(th) ~= 'number') or (v and type(v) ~= 'number') then
+ error('nn.Threshold(threshold, value)')
+ end
+ -- default for inplace is false
+ self.inplace = ip or false
+ if (ip and type(ip) ~= 'boolean') then
+ error('in-place flag must be boolean')
+ end
+ self:validateParameters()
+end
+
+function Threshold:updateOutput(input)
+ self:validateParameters()
+ input.THNN.Threshold_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ self.threshold,
+ self.val,
+ self.inplace
+ )
+ return self.output
+end
+
+function Threshold:updateGradInput(input, gradOutput)
+ self:validateParameters()
+ input.THNN.Threshold_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.threshold,
+ self.val,
+ self.inplace
+ )
+ return self.gradInput
+end
+
+function Threshold:validateParameters()
+ self.inplace = self.inplace or false -- backwards compatibility: models saved before the in-place flag existed
+ if self.inplace then
+ if self.val > self.threshold then
+ error('in-place processing requires value (' .. self.val ..
+ ') to not exceed threshold (' .. self.threshold .. ')')
+ end
+ end
+end
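+
+-- Usage sketch (illustrative): values not exceeding the threshold are replaced
+-- by val; nn.ReLU is the special case nn.Threshold(0, 0).
+--   local t = nn.Threshold(0.5, 0)
+--   local y = t:forward(torch.Tensor{0.2, 0.7})   -- {0, 0.7}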
diff --git a/contrib/lua-torch/nn/Transpose.lua b/contrib/lua-torch/nn/Transpose.lua
new file mode 100644
index 000000000..cceb2b643
--- /dev/null
+++ b/contrib/lua-torch/nn/Transpose.lua
@@ -0,0 +1,35 @@
+local Transpose, parent = torch.class('nn.Transpose', 'nn.Module')
+
+-- transpose dimensions:
+-- n = nn.Transpose({1,4},{1,3})
+-- will transpose dims 1 and 4, then 1 and 3...
+
+function Transpose:__init(...)
+ parent.__init(self)
+ self.permutations = {...}
+ self.numInputDims = nil
+end
+
+function Transpose:setNumInputDims(numInputDims)
+ self.numInputDims = numInputDims
+ return self
+end
+
+function Transpose:updateOutput(input)
+ local offset = self.numInputDims and input:nDimension()-self.numInputDims or 0
+ for _,perm in ipairs(self.permutations) do
+ input = input:transpose(perm[1]+offset,perm[2]+offset)
+ end
+ self.output:resizeAs(input):copy(input)
+ return self.output
+end
+
+function Transpose:updateGradInput(input, gradOutput)
+ for i = #self.permutations,1,-1 do
+ local perm = self.permutations[i]
+ local offset = self.numInputDims and input:nDimension()-self.numInputDims or 0
+ gradOutput = gradOutput:transpose(perm[1]+offset,perm[2]+offset)
+ end
+ self.gradInput:resizeAs(gradOutput):copy(gradOutput)
+ return self.gradInput
+end
diff --git a/contrib/lua-torch/nn/Unsqueeze.lua b/contrib/lua-torch/nn/Unsqueeze.lua
new file mode 100644
index 000000000..2e82a25a0
--- /dev/null
+++ b/contrib/lua-torch/nn/Unsqueeze.lua
@@ -0,0 +1,52 @@
+local Unsqueeze, parent = torch.class('nn.Unsqueeze', 'nn.Module')
+
+local function _assertTensor(t)
+ assert(torch.isTensor(t), "This module only works on tensors")
+end
+
+function Unsqueeze:__init(pos, numInputDims)
+ parent.__init(self)
+ self.pos = pos or error('the position at which to insert the singleton dim must be specified')
+ self:setNumInputDims(numInputDims)
+end
+
+function Unsqueeze:setNumInputDims(numInputDims)
+ self.numInputDims = numInputDims
+ return self
+end
+
+function Unsqueeze:updateOutput(input)
+ _assertTensor(input)
+ local actualPos = self:_getActualPosition(input)
+ nn.utils.addSingletonDimension(self.output, input, actualPos)
+ return self.output
+end
+
+function Unsqueeze:updateGradInput(input, gradOutput)
+ _assertTensor(input)
+ _assertTensor(gradOutput)
+ assert(input:nElement() == gradOutput:nElement())
+
+ self.gradInput:view(gradOutput, input:size())
+ return self.gradInput
+end
+
+function Unsqueeze:__tostring__()
+ return torch.type(self)..'(dim ' .. self.pos .. ')'
+end
+
+function Unsqueeze:_getActualPosition(input)
+ -- get the valid dimension offset for batch mode (if any)
+ local inputDim = input:dim() -- data batch dim
+ self.numInputDims = self.numInputDims or inputDim -- feature map dim
+ local offsetDim = inputDim - self.numInputDims
+ assert(offsetDim >= 0, "input feature map dim (numInputDims) must be <= input:dim()")
+
+ -- the actual position; clearer error message for batchMode (if any)
+ local actualPos = self.pos + offsetDim
+ assert(actualPos >= 1 and actualPos <= (inputDim + 1),
+ ("Invalid position: %d. input:dim() is %d, input feature map dim (numInputDims) is %d.")
+ :format(self.pos, inputDim, self.numInputDims)
+ )
+ return actualPos
+end
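+
+-- Usage sketch (illustrative): inserts a singleton dimension at position pos.
+--   local u = nn.Unsqueeze(1)
+--   u:forward(torch.randn(4, 5)):size()   -- 1 x 4 x 5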
diff --git a/contrib/lua-torch/nn/View.lua b/contrib/lua-torch/nn/View.lua
new file mode 100644
index 000000000..542e57e16
--- /dev/null
+++ b/contrib/lua-torch/nn/View.lua
@@ -0,0 +1,96 @@
+local View, parent = torch.class('nn.View', 'nn.Module')
+
+function View:resetSize(...)
+ if select('#', ...) == 1 and torch.typename(select(1, ...)) == 'torch.LongStorage' then
+ self.size = select(1, ...)
+ else
+ self.size = torch.LongStorage({...})
+ end
+
+ self.numElements = 1
+ local inferdim = false
+ for i = 1,#self.size do
+ local szi = self.size[i]
+ if szi >= 0 then
+ self.numElements = self.numElements * self.size[i]
+ else
+ assert(szi == -1, 'size should be positive or -1')
+ assert(not inferdim, 'only one dimension can be at -1')
+ inferdim = true
+ end
+ end
+
+ return self
+end
+
+function View:__init(...)
+ parent.__init(self)
+ self:resetSize(...)
+ self.numInputDims = nil
+end
+
+function View:setNumInputDims(numInputDims)
+ self.numInputDims = numInputDims
+ return self
+end
+
+local function batchsize(input, size, numInputDims, numElements)
+ local ind = input:nDimension()
+ local isz = input:size()
+ local maxdim = numInputDims and numInputDims or ind
+ local ine = 1
+ for i=ind,ind-maxdim+1,-1 do
+ ine = ine * isz[i]
+ end
+
+ if ine % numElements ~= 0 then
+ error(string.format(
+ 'input view (%s) and desired view (%s) do not match',
+ table.concat(input:size():totable(), 'x'),
+ table.concat(size:totable(), 'x')))
+ end
+
+ -- the remainder is either the batch...
+ local bsz = ine / numElements
+
+ -- ... or the missing size dim
+ for i=1,size:size() do
+ if size[i] == -1 then
+ bsz = 1
+ break
+ end
+ end
+
+ -- dims beyond maxdim are definitely batch dims
+ for i=ind-maxdim,1,-1 do
+ bsz = bsz * isz[i]
+ end
+
+ -- special case: no batch dimension is needed
+ if bsz == 1 and (not numInputDims or input:nDimension() <= numInputDims) then
+ return
+ end
+
+ return bsz
+end
+
+function View:updateOutput(input)
+ self.output = self.output or input.new()
+ local bsz = batchsize(input, self.size, self.numInputDims, self.numElements)
+ if bsz then
+ self.output:view(input, bsz, table.unpack(self.size:totable()))
+ else
+ self.output:view(input, self.size)
+ end
+ return self.output
+end
+
+function View:updateGradInput(input, gradOutput)
+ self.gradInput = self.gradInput or gradOutput.new()
+ self.gradInput:view(gradOutput, input:size())
+ return self.gradInput
+end
+
+function View:__tostring__()
+ return torch.type(self)..'('..table.concat(self.size:totable(), ', ')..')'
+end
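+
+-- Usage sketch (illustrative): -1 lets one dimension be inferred, and
+-- setNumInputDims() tells the module how many trailing dims are non-batch.
+--   local v = nn.View(-1):setNumInputDims(2)
+--   v:forward(torch.randn(8, 4, 5)):size()   -- 8 x 20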
diff --git a/contrib/lua-torch/nn/VolumetricAveragePooling.lua b/contrib/lua-torch/nn/VolumetricAveragePooling.lua
new file mode 100644
index 000000000..df6d2c405
--- /dev/null
+++ b/contrib/lua-torch/nn/VolumetricAveragePooling.lua
@@ -0,0 +1,54 @@
+local VolumetricAveragePooling, parent = torch.class(
+ 'nn.VolumetricAveragePooling', 'nn.Module')
+
+function VolumetricAveragePooling:__init(kT, kW, kH, dT, dW, dH)
+ parent.__init(self)
+
+ dT = dT or kT
+ dW = dW or kW
+ dH = dH or kH
+
+ self.kT = kT
+ self.kH = kH
+ self.kW = kW
+ self.dT = dT
+ self.dW = dW
+ self.dH = dH
+end
+
+function VolumetricAveragePooling:updateOutput(input)
+ input.THNN.VolumetricAveragePooling_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ self.kT, self.kW, self.kH,
+ self.dT, self.dW, self.dH
+ )
+ return self.output
+end
+
+function VolumetricAveragePooling:updateGradInput(input, gradOutput)
+ input.THNN.VolumetricAveragePooling_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.kT, self.kW, self.kH,
+ self.dT, self.dW, self.dH
+ )
+ return self.gradInput
+end
+
+function VolumetricAveragePooling:empty()
+ return parent.clearState(self)
+end
+
+function VolumetricAveragePooling:__tostring__()
+ local s = string.format('%s(%dx%dx%d, %d,%d,%d', torch.type(self),
+ self.kT, self.kW, self.kH, self.dT, self.dW, self.dH)
+ if (self.padT or self.padW or self.padH) and
+ (self.padT ~= 0 or self.padW ~= 0 or self.padH ~= 0) then
+ s = s .. ', ' .. self.padT.. ',' .. self.padW .. ','.. self.padH
+ end
+ s = s .. ')'
+
+ return s
+end
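+
+-- Usage sketch (illustrative): averages over kT x kW x kH volumes, with the
+-- stride defaulting to the kernel size.
+--   local ap = nn.VolumetricAveragePooling(2, 2, 2)
+--   ap:forward(torch.randn(1, 4, 4, 4)):size()   -- 1 x 2 x 2 x 2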
diff --git a/contrib/lua-torch/nn/VolumetricBatchNormalization.lua b/contrib/lua-torch/nn/VolumetricBatchNormalization.lua
new file mode 100644
index 000000000..6168a9245
--- /dev/null
+++ b/contrib/lua-torch/nn/VolumetricBatchNormalization.lua
@@ -0,0 +1,4 @@
+local BN, parent = torch.class('nn.VolumetricBatchNormalization', 'nn.BatchNormalization')
+
+-- expected dimension of input
+BN.nDim = 5
diff --git a/contrib/lua-torch/nn/VolumetricConvolution.lua b/contrib/lua-torch/nn/VolumetricConvolution.lua
new file mode 100644
index 000000000..329609aff
--- /dev/null
+++ b/contrib/lua-torch/nn/VolumetricConvolution.lua
@@ -0,0 +1,169 @@
+local THNN = require 'nn.THNN'
+local VolumetricConvolution, parent = torch.class('nn.VolumetricConvolution', 'nn.Module')
+
+function VolumetricConvolution:__init(nInputPlane, nOutputPlane, kT, kW, kH, dT, dW, dH, padT, padW, padH)
+ parent.__init(self)
+
+ dT = dT or 1
+ dW = dW or 1
+ dH = dH or 1
+
+ self.nInputPlane = nInputPlane
+ self.nOutputPlane = nOutputPlane
+ self.kT = kT
+ self.kW = kW
+ self.kH = kH
+ self.dT = dT
+ self.dW = dW
+ self.dH = dH
+ self.padT = padT or 0
+ self.padW = padW or self.padT
+ self.padH = padH or self.padW
+
+ self.weight = torch.Tensor(nOutputPlane, nInputPlane, kT, kH, kW)
+ self.bias = torch.Tensor(nOutputPlane)
+ self.gradWeight = torch.Tensor(nOutputPlane, nInputPlane, kT, kH, kW)
+ self.gradBias = torch.Tensor(nOutputPlane)
+ self:reset()
+end
+
+function VolumetricConvolution:reset(stdv)
+ if stdv then
+ stdv = stdv * math.sqrt(3)
+ else
+ stdv = 1/math.sqrt(self.kT*self.kW*self.kH*self.nInputPlane)
+ end
+ if nn.oldSeed then
+ self.weight:apply(function()
+ return torch.uniform(-stdv, stdv)
+ end)
+ if self.bias then
+ self.bias:apply(function()
+ return torch.uniform(-stdv, stdv)
+ end)
+ end
+ else
+ self.weight:uniform(-stdv, stdv)
+ if self.bias then
+ self.bias:uniform(-stdv, stdv)
+ end
+ end
+end
+
+function VolumetricConvolution:noBias()
+ self.bias = nil
+ self.gradBias = nil
+ return self
+end
+
+function VolumetricConvolution:updateOutput(input)
+ self.finput = self.finput or input.new()
+ self.fgradInput = self.fgradInput or input.new()
+ if torch.typename(input):find('torch%.Cuda.*Tensor') then
+ input.THNN.VolumetricConvolution_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ self.weight:cdata(),
+ THNN.optionalTensor(self.bias),
+ self.finput:cdata(),
+ self.fgradInput:cdata(),
+ self.dT, self.dW, self.dH,
+ self.padT, self.padW, self.padH
+ )
+ else
+ input.THNN.VolumetricConvolutionMM_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ self.weight:cdata(),
+ THNN.optionalTensor(self.bias),
+ self.finput:cdata(),
+ self.kT, self.kW, self.kH,
+ self.dT, self.dW, self.dH,
+ self.padT, self.padW, self.padH
+ )
+ end
+ return self.output
+end
+
+function VolumetricConvolution:updateGradInput(input, gradOutput)
+ if torch.typename(input):find('torch%.Cuda.*Tensor') then
+ input.THNN.VolumetricConvolution_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.weight:cdata(),
+ self.finput:cdata(),
+ self.dT, self.dW, self.dH,
+ self.padT, self.padW, self.padH
+ )
+ return self.gradInput
+ else
+ if self.gradInput then
+ input.THNN.VolumetricConvolutionMM_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.weight:cdata(),
+ self.finput:cdata(),
+ self.fgradInput:cdata(),
+ self.kT, self.kW, self.kH,
+ self.dT, self.dW, self.dH,
+ self.padT, self.padW, self.padH
+ )
+ return self.gradInput
+ end
+ end
+end
+
+function VolumetricConvolution:accGradParameters(input, gradOutput, scale)
+ if torch.typename(input):find('torch%.Cuda.*Tensor') then
+ input.THNN.VolumetricConvolution_accGradParameters(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradWeight:cdata(),
+ THNN.optionalTensor(self.gradBias),
+ self.finput:cdata(),
+ self.fgradInput:cdata(),
+ self.dT, self.dW, self.dH,
+ self.padT, self.padW, self.padH,
+ scale or 1
+ )
+ else
+ input.THNN.VolumetricConvolutionMM_accGradParameters(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradWeight:cdata(),
+ THNN.optionalTensor(self.gradBias),
+ self.finput:cdata(),
+ self.kT, self.kW, self.kH,
+ self.dT, self.dW, self.dH,
+ self.padT, self.padW, self.padH,
+ scale or 1
+ )
+ end
+end
+
+function VolumetricConvolution:type(type, tensorCache)
+ if self.finput then self.finput:set() end
+ if self.fgradInput then self.fgradInput:set() end
+ return parent.type(self, type, tensorCache)
+end
+
+function VolumetricConvolution:clearState()
+ nn.utils.clear(self, 'finput', 'fgradInput', '_input', '_gradOutput')
+ return parent.clearState(self)
+end
+
+function VolumetricConvolution:__tostring__()
+ local s = string.format('%s(%d -> %d, %dx%dx%d', torch.type(self),
+ self.nInputPlane, self.nOutputPlane, self.kT, self.kW, self.kH)
+ if self.dT ~= 1 or self.dW ~= 1 or self.dH ~= 1 or
+ self.padT ~= 0 or self.padW ~= 0 or self.padH ~= 0 then
+ s = s .. string.format(', %d,%d,%d', self.dT, self.dW, self.dH)
+ end
+ if (self.padT or self.padW or self.padH) and
+ (self.padT ~=0 or self.padW ~= 0 or self.padH ~= 0) then
+ s = s .. ', ' .. self.padT .. ',' .. self.padW .. ',' .. self.padH
+ end
+ return s .. ')'
+end
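+
+-- Usage sketch (illustrative): each output spatial extent is
+-- floor((i + 2*pad - k)/d) + 1.
+--   local c = nn.VolumetricConvolution(1, 8, 3, 3, 3)   -- k = 3, d = 1, pad = 0
+--   c:forward(torch.randn(1, 16, 16, 16)):size()        -- 8 x 14 x 14 x 14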
diff --git a/contrib/lua-torch/nn/VolumetricDilatedConvolution.lua b/contrib/lua-torch/nn/VolumetricDilatedConvolution.lua
new file mode 100644
index 000000000..f1337ebaa
--- /dev/null
+++ b/contrib/lua-torch/nn/VolumetricDilatedConvolution.lua
@@ -0,0 +1,84 @@
+local THNN = require 'nn.THNN'
+local VolumetricDilatedConvolution, parent = torch.class('nn.VolumetricDilatedConvolution', 'nn.VolumetricConvolution')
+
+function VolumetricDilatedConvolution:__init(nInputPlane, nOutputPlane, kT, kW, kH, dT, dW, dH, padT, padW, padH, dilationT, dilationW, dilationH)
+ parent.__init(self, nInputPlane, nOutputPlane, kT, kW, kH, dT, dW, dH, padT, padW, padH)
+
+ self.dilationT = dilationT or 1
+ self.dilationW = dilationW or 1
+ self.dilationH = dilationH or 1
+end
+
+function VolumetricDilatedConvolution:updateOutput(input)
+ self.finput = self.finput or self.weight.new()
+ self.fgradInput = self.fgradInput or self.weight.new()
+ input.THNN.VolumetricDilatedConvolution_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ self.weight:cdata(),
+ THNN.optionalTensor(self.bias),
+ self.finput:cdata(),
+ self.fgradInput:cdata(),
+ self.kT, self.kW, self.kH,
+ self.dT, self.dW, self.dH,
+ self.padT, self.padW, self.padH,
+ self.dilationT, self.dilationW, self.dilationH
+ )
+ return self.output
+end
+
+function VolumetricDilatedConvolution:updateGradInput(input, gradOutput)
+ if self.gradInput then
+ self.fgradInput = self.fgradInput or self.weight.new()
+ input.THNN.VolumetricDilatedConvolution_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.weight:cdata(),
+ self.finput:cdata(),
+ self.kT, self.kW, self.kH,
+ self.dT, self.dW, self.dH,
+ self.padT, self.padW, self.padH,
+ self.dilationT, self.dilationW, self.dilationH
+ )
+ return self.gradInput
+ end
+end
+
+function VolumetricDilatedConvolution:accGradParameters(input, gradOutput, scale)
+ scale = scale or 1
+ self.fgradInput = self.fgradInput or self.weight.new()
+ input.THNN.VolumetricDilatedConvolution_accGradParameters(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradWeight:cdata(),
+ THNN.optionalTensor(self.gradBias),
+ self.finput:cdata(),
+ self.fgradInput:cdata(),
+ self.kT, self.kW, self.kH,
+ self.dT, self.dW, self.dH,
+ self.padT, self.padW, self.padH,
+ self.dilationT, self.dilationW, self.dilationH,
+ scale
+ )
+end
+
+function VolumetricDilatedConvolution:__tostring__()
+ local s = string.format('%s(%d -> %d, %dx%dx%d', torch.type(self),
+ self.nInputPlane, self.nOutputPlane, self.kT, self.kW, self.kH)
+ if self.dT ~= 1 or self.dW ~= 1 or self.dH ~= 1
+ or self.padT ~= 0 or self.padW ~= 0 or self.padH ~= 0 then
+ s = s .. string.format(', %d,%d,%d', self.dT, self.dW, self.dH)
+ end
+ if (self.padT or self.padW or self.padH)
+ and (self.padT ~= 0 or self.padW ~= 0 or self.padH ~= 0) then
+ s = s .. ', ' .. self.padT .. ',' .. self.padW .. ',' .. self.padH
+ end
+ s = s .. ', ' .. self.dilationT .. ','
+ .. self.dilationW .. ',' .. self.dilationH
+ if self.bias then
+ return s .. ')'
+ else
+ return s .. ') without bias'
+ end
+end
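+
+-- Usage sketch (illustrative): dilation enlarges the receptive field to
+-- dilation*(k-1) + 1 without extra parameters, so each output extent is
+-- floor((i + 2*pad - dilation*(k-1) - 1)/d) + 1.
+--   local dc = nn.VolumetricDilatedConvolution(1, 8, 3,3,3, 1,1,1, 0,0,0, 2,2,2)
+--   dc:forward(torch.randn(1, 16, 16, 16)):size()   -- 8 x 12 x 12 x 12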
diff --git a/contrib/lua-torch/nn/VolumetricDilatedMaxPooling.lua b/contrib/lua-torch/nn/VolumetricDilatedMaxPooling.lua
new file mode 100644
index 000000000..249b2b58e
--- /dev/null
+++ b/contrib/lua-torch/nn/VolumetricDilatedMaxPooling.lua
@@ -0,0 +1,71 @@
+local THNN = require 'nn.THNN'
+local VolumetricDilatedMaxPooling, parent = torch.class('nn.VolumetricDilatedMaxPooling', 'nn.VolumetricMaxPooling')
+
+function VolumetricDilatedMaxPooling:__init(kT, kW, kH, dT, dW, dH, padT, padW, padH, dilationT, dilationW, dilationH)
+ parent.__init(self, kT, kW, kH, dT, dW, dH, padT, padW, padH)
+
+ self.dilationT = dilationT or 1
+ self.dilationW = dilationW or 1
+ self.dilationH = dilationH or 1
+
+end
+
+function VolumetricDilatedMaxPooling:updateOutput(input)
+ local dims = input:dim()
+ self.itime = input:size(dims-2)
+ self.iheight = input:size(dims-1)
+ self.iwidth = input:size(dims)
+
+ self.indices = self.indices or torch.LongTensor()
+ if torch.typename(input):find('torch%.Cuda.*Tensor') then
+ self.indices = torch.CudaLongTensor and self.indices:cudaLong() or self.indices
+ else
+ self.indices = self.indices:long()
+ end
+ input.THNN.VolumetricDilatedMaxPooling_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ self.indices:cdata(),
+ self.kT, self.kW, self.kH,
+ self.dT, self.dW, self.dH,
+ self.padT, self.padW, self.padH,
+ self.dilationT, self.dilationW, self.dilationH,
+ self.ceil_mode
+ )
+ return self.output
+end
+
+function VolumetricDilatedMaxPooling:updateGradInput(input, gradOutput)
+ input.THNN.VolumetricDilatedMaxPooling_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.indices:cdata(),
+ self.kT, self.kW, self.kH,
+ self.dT, self.dW, self.dH,
+ self.padT, self.padW, self.padH,
+ self.dilationT, self.dilationW, self.dilationH,
+ self.ceil_mode
+ )
+ return self.gradInput
+end
+
+function VolumetricDilatedMaxPooling:clearState()
+ if self.indices then
+ self.indices:set()
+ end
+ return parent.clearState(self)
+end
+
+function VolumetricDilatedMaxPooling:__tostring__()
+ local s = string.format('%s(%dx%dx%d, %d,%d,%d', torch.type(self),
+ self.kT, self.kW, self.kH, self.dT, self.dW, self.dH)
+ if (self.padT or self.padW or self.padH) and
+ (self.padT ~= 0 or self.padW ~= 0 or self.padH ~= 0) then
+ s = s .. ', ' .. self.padT.. ',' .. self.padW .. ','.. self.padH
+ end
+ s = s .. ', ' .. self.dilationT .. ',' .. self.dilationW .. ',' .. self.dilationH
+ s = s .. ')'
+
+ return s
+end
diff --git a/contrib/lua-torch/nn/VolumetricDropout.lua b/contrib/lua-torch/nn/VolumetricDropout.lua
new file mode 100644
index 000000000..809e28afe
--- /dev/null
+++ b/contrib/lua-torch/nn/VolumetricDropout.lua
@@ -0,0 +1,55 @@
+local VolumetricDropout, Parent = torch.class('nn.VolumetricDropout', 'nn.Module')
+
+function VolumetricDropout:__init(p,stochasticInference)
+ Parent.__init(self)
+ self.p = p or 0.5
+ self.train = true
+ self.stochastic_inference = stochasticInference or false
+ self.noise = torch.Tensor()
+end
+
+function VolumetricDropout:updateOutput(input)
+ self.output:resizeAs(input):copy(input)
+ if self.train or self.stochastic_inference then
+ if input:dim() == 5 then
+ self.noise:resize(input:size(1), input:size(2), 1, 1, 1)
+ elseif input:dim() == 4 then
+ self.noise:resize(input:size(1), 1, 1, 1)
+ else
+ error('Input must be 5D (nbatch, nfeat, t, h, w) or 4D (nfeat, t, h, w)')
+ end
+ self.noise:bernoulli(1-self.p)
+ -- We expand the random dropouts to the entire feature map because the
+ -- features are likely correlated across the map and so the dropout
+ -- should also be correlated.
+ self.output:cmul(torch.expandAs(self.noise, input))
+ else
+ self.output:mul(1-self.p)
+ end
+ return self.output
+end
+
+function VolumetricDropout:updateGradInput(input, gradOutput)
+ if self.train then
+ self.gradInput:resizeAs(gradOutput):copy(gradOutput)
+ self.gradInput:cmul(torch.expandAs(self.noise, input)) -- simply mask the gradients with the noise vector
+ else
+ error('backprop only defined while training')
+ end
+ return self.gradInput
+end
+
+function VolumetricDropout:setp(p)
+ self.p = p
+end
+
+function VolumetricDropout:__tostring__()
+ return string.format('%s(%f)', torch.type(self), self.p)
+end
+
+function VolumetricDropout:clearState()
+ if self.noise then
+ self.noise:set()
+ end
+ return Parent.clearState(self)
+end
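+
+-- Usage sketch (illustrative): drops whole feature maps with probability p
+-- while training; in evaluation mode the output is scaled by (1 - p) instead.
+--   local d = nn.VolumetricDropout(0.3)
+--   local y = d:forward(torch.randn(2, 8, 4, 4, 4))   -- entire 4x4x4 maps zeroed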
diff --git a/contrib/lua-torch/nn/VolumetricFractionalMaxPooling.lua b/contrib/lua-torch/nn/VolumetricFractionalMaxPooling.lua
new file mode 100644
index 000000000..f5ff58cf0
--- /dev/null
+++ b/contrib/lua-torch/nn/VolumetricFractionalMaxPooling.lua
@@ -0,0 +1,175 @@
+local VolumetricFractionalMaxPooling, parent =
+ torch.class('nn.VolumetricFractionalMaxPooling', 'nn.Module')
+
+-- Usage:
+-- nn.VolumetricFractionalMaxPooling(poolSizeT, poolSizeW, poolSizeH, outT, outW, outH)
+-- the output should be the exact size (outT x outH x outW)
+-- nn.VolumetricFractionalMaxPooling(poolSizeT, poolSizeW, poolSizeH, ratioT, ratioW, ratioH)
+-- the output should be the size (floor(inT x ratioT) x floor(inH x ratioH) x floor(inW x ratioW))
+-- ratios are numbers strictly between 0 and 1
+function VolumetricFractionalMaxPooling:__init(poolSizeT, poolSizeW, poolSizeH, arg1, arg2, arg3)
+ parent.__init(self)
+ assert(poolSizeT >= 2)
+ assert(poolSizeW >= 2)
+ assert(poolSizeH >= 2)
+
+ -- Pool size (how wide the pooling for each output unit is)
+ self.poolSizeT = poolSizeT
+ self.poolSizeW = poolSizeW
+ self.poolSizeH = poolSizeH
+
+ -- Random samples are drawn for all
+ -- batch * plane * (time, height, width; i.e., 3) points. This determines
+ -- the 3d "pseudorandom" overlapping pooling regions for each
+ -- (batch element x input plane). A new set of random samples is
+ -- drawn every updateOutput call, unless we disable it via
+ -- :fixPoolingRegions().
+ self.randomSamples = nil
+
+ -- Flag to disable re-generation of random samples between updateOutput
+ -- calls; intended for testing purposes
+ self.newRandomPool = false
+
+ if arg1 >= 1 and arg2 >= 1 and arg3 >= 1 then
+ -- Desired output size: the input tensor will determine the reduction
+ -- ratio
+ self.outT = arg1
+ self.outW = arg2
+ self.outH = arg3
+ else
+ -- Reduction ratio specified per each input
+ -- This is the reduction ratio that we use
+ self.ratioT = arg1
+ self.ratioW = arg2
+ self.ratioH = arg3
+
+ -- The reduction ratio must be between 0 and 1
+ assert(self.ratioT > 0 and self.ratioT < 1)
+ assert(self.ratioW > 0 and self.ratioW < 1)
+ assert(self.ratioH > 0 and self.ratioH < 1)
+ end
+end
+
+function VolumetricFractionalMaxPooling:getBufferSize_(input)
+ local batchSize = 0
+ local planeSize = 0
+
+ if input:nDimension() == 4 then
+ batchSize = 1
+ planeSize = input:size(1)
+ elseif input:nDimension() == 5 then
+ batchSize = input:size(1)
+ planeSize = input:size(2)
+ else
+ error('input must be dim 4 or 5')
+ end
+
+ return torch.LongStorage({batchSize, planeSize, 3})
+end
+
+function VolumetricFractionalMaxPooling:initSampleBuffer_(input)
+ local sampleBufferSize = self:getBufferSize_(input)
+
+ if self.randomSamples == nil then
+ self.randomSamples = input.new():resize(sampleBufferSize):uniform()
+ elseif (self.randomSamples:size(1) ~= sampleBufferSize[1] or
+ self.randomSamples:size(2) ~= sampleBufferSize[2]) then
+ self.randomSamples:resize(sampleBufferSize):uniform()
+ else
+ if not self.newRandomPool then
+ -- Create new pooling windows, since this is a subsequent call
+ self.randomSamples:uniform()
+ end
+ end
+end
+
+function VolumetricFractionalMaxPooling:getOutputSizes_(input)
+ local outT = self.outT
+ local outW = self.outW
+ local outH = self.outH
+ if self.ratioT ~= nil and self.ratioW ~= nil and self.ratioH ~= nil then
+ if input:nDimension() == 5 then
+ outT = math.floor(input:size(5) * self.ratioT)
+ outW = math.floor(input:size(4) * self.ratioW)
+ outH = math.floor(input:size(3) * self.ratioH)
+ elseif input:nDimension() == 4 then
+ outT = math.floor(input:size(4) * self.ratioT)
+ outW = math.floor(input:size(3) * self.ratioW)
+ outH = math.floor(input:size(2) * self.ratioH)
+ else
+ error('input must be dim 4 or 5')
+ end
+
+ -- Neither can be smaller than 1
+ assert(outT > 0, 'reduction ratio or input time too small')
+ assert(outW > 0, 'reduction ratio or input width too small')
+ assert(outH > 0, 'reduction ratio or input height too small')
+ else
+ assert(outT ~= nil and outW ~= nil and outH ~= nil)
+ end
+
+ return outT, outW, outH
+end
+
+-- Call this to turn off regeneration of random pooling regions each
+-- updateOutput call.
+function VolumetricFractionalMaxPooling:fixPoolingRegions(val)
+ if val == nil then
+ val = true
+ end
+
+ self.newRandomPool = val
+ return self
+end
+
+function VolumetricFractionalMaxPooling:updateOutput(input)
+ self.indices = self.indices or torch.LongTensor()
+ if torch.typename(input):find('torch%.Cuda.*Tensor') then
+ self.indices = torch.CudaLongTensor and self.indices:cudaLong() or self.indices
+ else
+ self.indices = self.indices:long()
+ end
+ self:initSampleBuffer_(input)
+ local outT, outW, outH = self:getOutputSizes_(input)
+
+ input.THNN.VolumetricFractionalMaxPooling_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ outT, outW, outH, self.poolSizeT, self.poolSizeW, self.poolSizeH,
+ self.indices:cdata(), self.randomSamples:cdata())
+ return self.output
+end
+
+function VolumetricFractionalMaxPooling:updateGradInput(input, gradOutput)
+ assert(self.randomSamples ~= nil,
+ 'must call updateOutput/forward first')
+
+ local outT, outW, outH = self:getOutputSizes_(input)
+
+ input.THNN.VolumetricFractionalMaxPooling_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ outT, outW, outH, self.poolSizeT, self.poolSizeW, self.poolSizeH,
+ self.indices:cdata())
+ return self.gradInput
+end
+
+-- backward compat
+function VolumetricFractionalMaxPooling:empty()
+ self:clearState()
+end
+
+function VolumetricFractionalMaxPooling:clearState()
+ self.indices = nil
+ self.randomSamples = nil
+ return parent.clearState(self)
+end
+
+function VolumetricFractionalMaxPooling:__tostring__()
+ return string.format('%s(%dx%dx%d, %d,%d,%d)', torch.type(self),
+ self.outT and self.outT or self.ratioT,
+ self.outW and self.outW or self.ratioW,
+ self.outH and self.outH or self.ratioH,
+ self.poolSizeT, self.poolSizeW, self.poolSizeH)
+end
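+
+-- Usage sketch (illustrative): the output size may be given exactly or as
+-- per-dimension ratios applied to the input size.
+--   local fp = nn.VolumetricFractionalMaxPooling(2, 2, 2, 0.5, 0.5, 0.5)
+--   fp:forward(torch.randn(4, 16, 16, 16)):size()   -- 4 x 8 x 8 x 8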
diff --git a/contrib/lua-torch/nn/VolumetricFullConvolution.lua b/contrib/lua-torch/nn/VolumetricFullConvolution.lua
new file mode 100644
index 000000000..0ce23401e
--- /dev/null
+++ b/contrib/lua-torch/nn/VolumetricFullConvolution.lua
@@ -0,0 +1,225 @@
+local THNN = require 'nn.THNN'
+local VolumetricFullConvolution, parent = torch.class('nn.VolumetricFullConvolution','nn.Module')
+
+function VolumetricFullConvolution:__init(nInputPlane, nOutputPlane,
+ kT, kW, kH, -- kernel size
+ dT, dW, dH, -- stride
+ padT, padW, padH, -- padding
+ adjT, adjW, adjH) -- extra output adjustment
+ parent.__init(self)
+
+ dW = dW or 1
+ dH = dH or 1
+ dT = dT or 1
+
+ self.nInputPlane = nInputPlane
+ self.nOutputPlane = nOutputPlane
+ self.kW = kW
+ self.kH = kH
+ self.kT = kT
+ self.dW = dW
+ self.dH = dH
+ self.dT = dT
+ self.padW = padW or 0
+ self.padH = padH or 0
+ self.padT = padT or 0
+ self.adjW = adjW or 0
+ self.adjH = adjH or 0
+ self.adjT = adjT or 0
+
+ if self.adjW > self.dW - 1 or self.adjH > self.dH - 1 or self.adjT > self.dT - 1 then
+ error('adjW, adjH and adjT may not be greater than self.dW - 1,' ..
+ ' self.dH - 1 and self.dT - 1 respectively')
+ end
+
+ self.weight = torch.Tensor(nInputPlane, nOutputPlane, kT, kH, kW)
+ self.gradWeight = torch.Tensor(nInputPlane, nOutputPlane, kT, kH, kW)
+ self.bias = torch.Tensor(self.nOutputPlane)
+ self.gradBias = torch.Tensor(self.nOutputPlane)
+
+ self.ones = torch.Tensor()
+ self.finput = torch.Tensor()
+ self.fgradInput = torch.Tensor()
+
+ self:reset()
+end
+
+function VolumetricFullConvolution:reset(stdv)
+ if stdv then
+ stdv = stdv * math.sqrt(3)
+ else
+ local nInputPlane = self.nInputPlane
+ local kT = self.kT
+ local kH = self.kH
+ local kW = self.kW
+ stdv = 1/math.sqrt(kW*kH*kT*nInputPlane)
+ end
+ self.weight:uniform(-stdv, stdv)
+ self.bias:uniform(-stdv, stdv)
+end
+
+local function calculateAdj(targetSize, ker, pad, stride)
+ return (targetSize + 2 * pad - ker) % stride
+end
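+
+-- For a transposed convolution the output extent per dimension is
+-- (i - 1)*d - 2*pad + k + adj; calculateAdj picks the adj that hits a
+-- requested target size exactly, e.g. targetSize = 15, ker = 4, pad = 1,
+-- stride = 2 gives adj = (15 + 2 - 4) % 2 = 1.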
+
+function VolumetricFullConvolution:backCompatibility()
+ -- Transpose the weight when loading from an old version
+ if not self.adjW then
+ self.weight = self.weight:transpose(1, 2):contiguous()
+ end
+
+ -- Rename the padding when loading from an old version
+ self.padW = self.padW or self.pW
+ self.padH = self.padH or self.pH
+ self.padT = self.padT or self.pT
+
+ self.adjW = self.adjW or 0
+ self.adjH = self.adjH or 0
+ self.adjT = self.adjT or 0
+end
+
+
+function VolumetricFullConvolution:noBias()
+ self.bias = nil
+ self.gradBias = nil
+ return self
+end
+
+function VolumetricFullConvolution:updateOutput(input)
+ self:backCompatibility()
+
+ local inputTensor = input
+ local adjT, adjW, adjH = self.adjT, self.adjW, self.adjH
+
+ -- The input can be a table where the second element indicates the target
+ -- output size, in which case the adj factors are computed automatically
+ if type(inputTensor) == 'table' then
+ inputTensor = input[1]
+ local targetTensor = input[2]
+ local tDims = targetTensor:dim()
+ local tT = targetTensor:size(tDims-2)
+ local tH = targetTensor:size(tDims-1)
+ local tW = targetTensor:size(tDims)
+ adjT = calculateAdj(tT, self.kT, self.padT, self.dT)
+ adjW = calculateAdj(tW, self.kW, self.padW, self.dW)
+ adjH = calculateAdj(tH, self.kH, self.padH, self.dH)
+ end
+
+ inputTensor.THNN.VolumetricFullConvolution_updateOutput(
+ inputTensor:cdata(),
+ self.output:cdata(),
+ self.weight:cdata(),
+ THNN.optionalTensor(self.bias),
+ self.finput:cdata(),
+ self.fgradInput:cdata(),
+ self.dT, self.dW, self.dH,
+ self.padT, self.padW, self.padH,
+ adjT, adjW, adjH
+ )
+
+ return self.output
+end
+
+function VolumetricFullConvolution:updateGradInput(input, gradOutput)
+ self:backCompatibility()
+
+ local inputTensor = input
+ local adjT, adjW, adjH = self.adjT, self.adjW, self.adjH
+
+ -- The input can be a table where the second element indicates the target
+ -- output size, in which case the adj factors are computed automatically
+ if type(inputTensor) == 'table' then
+ inputTensor = input[1]
+ local targetTensor = input[2]
+ local tDims = targetTensor:dim()
+ local tT = targetTensor:size(tDims-2)
+ local tH = targetTensor:size(tDims-1)
+ local tW = targetTensor:size(tDims)
+ adjT = calculateAdj(tT, self.kT, self.padT, self.dT)
+ adjW = calculateAdj(tW, self.kW, self.padW, self.dW)
+ adjH = calculateAdj(tH, self.kH, self.padH, self.dH)
+ -- Temporarily extract the gradInput tensor
+ if type(self.gradInput) == 'table' then
+ self.gradInput = self.gradInput[1]
+ end
+ end
+
+ inputTensor.THNN.VolumetricFullConvolution_updateGradInput(
+ inputTensor:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.weight:cdata(),
+ self.finput:cdata(),
+ self.fgradInput:cdata(),
+ self.dT, self.dW, self.dH,
+ self.padT, self.padW, self.padH,
+ adjT, adjW, adjH
+ )
+
+ if type(input) == 'table' then
+ -- Create a zero tensor to be expanded and used as gradInput[2].
+ self.zeroScalar = self.zeroScalar or input[2].new(1):zero()
+ self.ones:resize(input[2]:dim()):fill(1)
+ local zeroTensor = self.zeroScalar
+ :view(table.unpack(self.ones:totable()))
+ :expandAs(input[2])
+ self.gradInput = {self.gradInput, zeroTensor}
+ end
+
+ return self.gradInput
+end
+
+function VolumetricFullConvolution:accGradParameters(input, gradOutput, scale)
+ self:backCompatibility()
+
+ local inputTensor = input
+ local adjT, adjW, adjH = self.adjT, self.adjW, self.adjH
+
+ -- The input can be a table where the second element indicates the target
+ -- output size, in which case the adj factors are computed automatically
+ if type(inputTensor) == 'table' then
+ inputTensor = input[1]
+ local targetTensor = input[2]
+ local tDims = targetTensor:dim()
+ local tT = targetTensor:size(tDims-2)
+ local tH = targetTensor:size(tDims-1)
+ local tW = targetTensor:size(tDims)
+ adjT = calculateAdj(tT, self.kT, self.padT, self.dT)
+ adjW = calculateAdj(tW, self.kW, self.padW, self.dW)
+ adjH = calculateAdj(tH, self.kH, self.padH, self.dH)
+ end
+
+ inputTensor.THNN.VolumetricFullConvolution_accGradParameters(
+ inputTensor:cdata(),
+ gradOutput:cdata(),
+ self.gradWeight:cdata(),
+ THNN.optionalTensor(self.gradBias),
+ self.finput:cdata(),
+ self.fgradInput:cdata(),
+ self.dT, self.dW, self.dH,
+ self.padT, self.padW, self.padH,
+ adjT, adjW, adjH,
+ scale or 1
+ )
+end
+
+function VolumetricFullConvolution:type(type, tensorCache)
+ self.finput = torch.Tensor()
+ self.fgradInput = torch.Tensor()
+ return parent.type(self, type, tensorCache)
+end
+
+function VolumetricFullConvolution:__tostring__()
+ local s = string.format('%s(%d -> %d, %dx%dx%d', torch.type(self),
+ self.nInputPlane, self.nOutputPlane, self.kT, self.kW, self.kH)
+ if self.dT ~= 1 or self.dW ~= 1 or self.dH ~= 1 or self.padT ~= 0 or self.padW ~= 0 or self.padH ~= 0 then
+ s = s .. string.format(', %d,%d,%d', self.dT, self.dW, self.dH)
+ end
+ if (self.padT or self.padW or self.padH) and (self.padT ~= 0 or self.padW ~= 0 or self.padH ~= 0) then
+ s = s .. ', ' .. self.padT .. ',' .. self.padW .. ',' .. self.padH
+ end
+ if (self.adjT or self.adjW or self.adjH) and (self.adjT ~= 0 or self.adjW ~= 0 or self.adjH ~= 0) then
+ s = s .. ', ' .. self.adjT .. ',' .. self.adjW .. ',' .. self.adjH
+ end
+ return s .. ')'
+end
diff --git a/contrib/lua-torch/nn/VolumetricMaxPooling.lua b/contrib/lua-torch/nn/VolumetricMaxPooling.lua
new file mode 100644
index 000000000..e25c5b31c
--- /dev/null
+++ b/contrib/lua-torch/nn/VolumetricMaxPooling.lua
@@ -0,0 +1,102 @@
+local VolumetricMaxPooling, parent = torch.class('nn.VolumetricMaxPooling', 'nn.Module')
+
+VolumetricMaxPooling.__version = 2
+
+function VolumetricMaxPooling:__init(kT, kW, kH, dT, dW, dH, padT, padW, padH)
+ parent.__init(self)
+
+ dT = dT or kT
+ dW = dW or kW
+ dH = dH or kH
+
+ self.kT = kT
+ self.kH = kH
+ self.kW = kW
+ self.dT = dT
+ self.dW = dW
+ self.dH = dH
+
+ self.padT = padT or 0
+ self.padW = padW or 0
+ self.padH = padH or 0
+
+
+ self.ceil_mode = false
+ self.indices = torch.LongTensor()
+end
+
+function VolumetricMaxPooling:ceil()
+ self.ceil_mode = true
+ return self
+end
+
+function VolumetricMaxPooling:floor()
+ self.ceil_mode = false
+ return self
+end
+
+function VolumetricMaxPooling:updateOutput(input)
+ local dims = input:dim()
+ self.itime = input:size(dims-2)
+ self.iheight = input:size(dims-1)
+ self.iwidth = input:size(dims)
+
+ self.indices = self.indices or torch.LongTensor()
+ if torch.typename(input):find('torch%.Cuda.*Tensor') then
+ self.indices = torch.CudaLongTensor and self.indices:cudaLong() or self.indices
+ else
+ self.indices = self.indices:long()
+ end
+ input.THNN.VolumetricMaxPooling_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ self.indices:cdata(),
+ self.kT, self.kW, self.kH,
+ self.dT, self.dW, self.dH,
+ self.padT, self.padW, self.padH,
+ self.ceil_mode
+ )
+ return self.output
+end
+
+function VolumetricMaxPooling:updateGradInput(input, gradOutput)
+ input.THNN.VolumetricMaxPooling_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.indices:cdata(),
+ self.kT, self.kW, self.kH,
+ self.dT, self.dW, self.dH,
+ self.padT, self.padW, self.padH,
+ self.ceil_mode
+ )
+ return self.gradInput
+end
+
+function VolumetricMaxPooling:empty()
+ self:clearState()
+end
+
+function VolumetricMaxPooling:clearState()
+ if self.indices then self.indices:set() end
+ return parent.clearState(self)
+end
+
+function VolumetricMaxPooling:read(file, version)
+ parent.read(self, file)
+ if version < 2 then
+ self.ceil_mode = false
+ end
+end
+
+function VolumetricMaxPooling:__tostring__()
+ local s = string.format('%s(%dx%dx%d, %d,%d,%d', torch.type(self),
+ self.kT, self.kW, self.kH, self.dT, self.dW, self.dH)
+ if (self.padT or self.padW or self.padH) and
+ (self.padT ~= 0 or self.padW ~= 0 or self.padH ~= 0) then
+ s = s .. ', ' .. self.padT.. ',' .. self.padW .. ','.. self.padH
+ end
+ s = s .. ')'
+
+ return s
+end
diff --git a/contrib/lua-torch/nn/VolumetricMaxUnpooling.lua b/contrib/lua-torch/nn/VolumetricMaxUnpooling.lua
new file mode 100644
index 000000000..6291f5b85
--- /dev/null
+++ b/contrib/lua-torch/nn/VolumetricMaxUnpooling.lua
@@ -0,0 +1,56 @@
+local VolumetricMaxUnpooling, parent = torch.class('nn.VolumetricMaxUnpooling', 'nn.Module')
+
+function VolumetricMaxUnpooling:__init(poolingModule)
+ parent.__init(self)
+ assert(torch.type(poolingModule)=='nn.VolumetricMaxPooling', 'Argument must be a nn.VolumetricMaxPooling module')
+ assert(poolingModule.kT==poolingModule.dT and poolingModule.kH==poolingModule.dH and poolingModule.kW==poolingModule.dW, "The size of pooling module's kernel must be equal to its stride")
+ self.pooling = poolingModule
+end
+
+function VolumetricMaxUnpooling:setParams()
+ self.indices = self.pooling.indices
+ self.otime = self.pooling.itime
+ self.oheight = self.pooling.iheight
+ self.owidth = self.pooling.iwidth
+ self.dT = self.pooling.dT
+ self.dH = self.pooling.dH
+ self.dW = self.pooling.dW
+ self.padT = self.pooling.padT
+ self.padH = self.pooling.padH
+ self.padW = self.pooling.padW
+end
+
+function VolumetricMaxUnpooling:updateOutput(input)
+ self:setParams()
+ input.THNN.VolumetricMaxUnpooling_updateOutput(
+ input:cdata(),
+ self.output:cdata(),
+ self.indices:cdata(),
+ self.otime, self.owidth, self.oheight,
+ self.dT, self.dW, self.dH,
+ self.padT, self.padW, self.padH
+ )
+ return self.output
+end
+
+function VolumetricMaxUnpooling:updateGradInput(input, gradOutput)
+ self:setParams()
+ input.THNN.VolumetricMaxUnpooling_updateGradInput(
+ input:cdata(),
+ gradOutput:cdata(),
+ self.gradInput:cdata(),
+ self.indices:cdata(),
+ self.otime, self.owidth, self.oheight,
+ self.dT, self.dW, self.dH,
+ self.padT, self.padW, self.padH
+ )
+ return self.gradInput
+end
+
+function VolumetricMaxUnpooling:empty()
+ self:clearState()
+end
+
+function VolumetricMaxUnpooling:__tostring__()
+ return 'nn.VolumetricMaxUnpooling associated to '..tostring(self.pooling)
+end
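+
+-- Usage sketch (illustrative): restores the spatial size recorded by the
+-- paired pooling module, routing gradients through the stored indices.
+--   local mp = nn.VolumetricMaxPooling(2, 2, 2)
+--   local up = nn.VolumetricMaxUnpooling(mp)
+--   up:forward(mp:forward(torch.randn(1, 4, 4, 4))):size()   -- 1 x 4 x 4 x 4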
diff --git a/contrib/lua-torch/nn/VolumetricReplicationPadding.lua b/contrib/lua-torch/nn/VolumetricReplicationPadding.lua
new file mode 100644
index 000000000..31a9503fd
--- /dev/null
+++ b/contrib/lua-torch/nn/VolumetricReplicationPadding.lua
@@ -0,0 +1,58 @@
+local VolumetricReplicationPadding, parent =
+ torch.class('nn.VolumetricReplicationPadding', 'nn.Module')
+
+function VolumetricReplicationPadding:__init(pleft, pright, ptop, pbottom,
+ pfront, pback)
+ parent.__init(self)
+ self.pleft = pleft
+ self.pright = pright or self.pleft
+ self.ptop = ptop or self.pleft
+ self.pbottom = pbottom or self.pleft
+ self.pfront = pfront or self.pleft
+ self.pback = pback or self.pleft
+end
+
+function VolumetricReplicationPadding:updateOutput(input)
+ if input:dim() == 4 or input:dim() == 5 then
+ input.THNN.VolumetricReplicationPadding_updateOutput(
+ input:cdata(), self.output:cdata(),
+ self.pleft, self.pright, self.ptop, self.pbottom, self.pfront,
+ self.pback)
+ else
+ error('input must be 4 or 5-dimensional')
+ end
+ return self.output
+end
+
+function VolumetricReplicationPadding:updateGradInput(input, gradOutput)
+ if input:dim() == 4 and gradOutput:dim() == 4 then
+ assert(input:size(1) == gradOutput:size(1)
+ and input:size(2) + self.pfront + self.pback == gradOutput:size(2)
+ and input:size(3) + self.ptop + self.pbottom == gradOutput:size(3)
+ and input:size(4) + self.pleft + self.pright == gradOutput:size(4),
+ 'input and gradOutput must be compatible in size')
+ elseif input:dim() == 5 and gradOutput:dim() == 5 then
+ assert(input:size(1) == gradOutput:size(1)
+ and input:size(2) == gradOutput:size(2)
+ and input:size(3) + self.pfront + self.pback == gradOutput:size(3)
+ and input:size(4) + self.ptop + self.pbottom == gradOutput:size(4)
+ and input:size(5) + self.pleft + self.pright == gradOutput:size(5),
+ 'input and gradOutput must be compatible in size')
+ else
+ error(
+ [[input and gradOutput must be 4 or 5-dimensional
+ and have equal number of dimensions]]
+ )
+ end
+ input.THNN.VolumetricReplicationPadding_updateGradInput(
+ input:cdata(), gradOutput:cdata(), self.gradInput:cdata(),
+ self.pleft, self.pright, self.ptop, self.pbottom, self.pfront, self.pback)
+ return self.gradInput
+end
+
+function VolumetricReplicationPadding:__tostring__()
+ return torch.type(self) ..
+ string.format('(left=%d, right=%d, top=%d, bottom=%d, front=%d, back=%d)',
+ self.pleft, self.pright, self.ptop, self.pbottom,
+ self.pfront, self.pback)
+end
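+
+-- Usage sketch (illustrative): a single argument replicates the border by the
+-- same amount on all six sides.
+--   local pad = nn.VolumetricReplicationPadding(1)
+--   pad:forward(torch.randn(2, 4, 4, 4)):size()   -- 2 x 6 x 6 x 6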
diff --git a/contrib/lua-torch/nn/WeightNorm.lua b/contrib/lua-torch/nn/WeightNorm.lua
new file mode 100644
index 000000000..3ffcd90aa
--- /dev/null
+++ b/contrib/lua-torch/nn/WeightNorm.lua
@@ -0,0 +1,208 @@
+-- Weight Normalization
+-- https://arxiv.org/pdf/1602.07868v3.pdf
+local WeightNorm, parent = torch.class("nn.WeightNorm", "nn.Decorator")
+
+function WeightNorm:__init(module, outputDim)
+ -- this container applies Weight Normalization to any module it wraps
+ -- it accepts a parameter ``outputDim`` giving which weight dimension indexes the outputs
+ -- if outputDim is not 1, the container transposes the weight
+ -- if the weight is not 2D, the container views it in a 2D shape,
+ -- namely nOut x (nIn x kw x dw x ...)
+
+ parent.__init(self, module)
+ assert(module.weight)
+
+ if module.bias then
+ self.bias = module.bias
+ self.gradBias = module.gradBias
+ end
+ self.gradWeight = module.gradWeight
+ self.weight = module.weight
+
+ self.outputDim = outputDim or 1
+
+ -- track the non-output weight dimensions
+ self.otherDims = 1
+ for i = 1, self.weight:dim() do
+ if i ~= self.outputDim then
+ self.otherDims = self.otherDims * self.weight:size(i)
+ end
+ end
+
+ -- view size for weight norm 2D calculations
+ self.viewIn = torch.LongStorage({self.weight:size(self.outputDim), self.otherDims})
+
+ -- view size back to original weight
+ self.viewOut = self.weight:size()
+ self.weightSize = self.weight:size()
+
+ -- bubble outputDim size up to the front
+ for i = self.outputDim - 1, 1, -1 do
+ self.viewOut[i], self.viewOut[i + 1] = self.viewOut[i + 1], self.viewOut[i]
+ end
+
+ -- weight is reparametrized to decouple the length from the direction
+ -- such that w = g * ( v / ||v|| )
+ self.v = torch.Tensor(self.viewIn[1], self.viewIn[2])
+ self.g = torch.Tensor(self.viewIn[1])
+
+ self._norm = torch.Tensor(self.viewIn[1])
+ self._scale = torch.Tensor(self.viewIn[1])
+
+ -- gradient of g
+ self.gradG = torch.Tensor(self.viewIn[1]):zero()
+ -- gradient of v
+ self.gradV = torch.Tensor(self.viewIn)
+
+ self:resetInit()
+end
+
+function WeightNorm:permuteIn(inpt)
+ local ans = inpt
+ for i = self.outputDim - 1, 1, -1 do
+ ans = ans:transpose(i, i+1)
+ end
+ return ans
+end
+
+function WeightNorm:permuteOut(inpt)
+ local ans = inpt
+ for i = 1, self.outputDim - 1 do
+ ans = ans:transpose(i, i+1)
+ end
+ return ans
+end
+
+function WeightNorm:resetInit(inputSize, outputSize)
+ self.v:normal(0, math.sqrt(2/self.viewIn[2]))
+ self.g:norm(self.v, 2, 2)
+ if self.bias then
+ self.bias:zero()
+ end
+end
+
+function WeightNorm:evaluate()
+ if not(self.train == false) then
+ self:updateWeight()
+ parent.evaluate(self)
+ end
+end
+
+function WeightNorm:updateWeight()
+ -- view to 2D when weight norm container operates
+ self.gradV:copy(self:permuteIn(self.weight))
+ self.gradV = self.gradV:view(self.viewIn)
+
+ -- ||v|| (a small epsilon keeps the norm away from zero)
+ self._norm:norm(self.v, 2, 2):pow(2):add(10e-5):sqrt()
+ -- w = g * v / ||v||
+ self.gradV:copy(self.v)
+ self._scale:copy(self.g):cdiv(self._norm)
+ self.gradV:cmul(self._scale:view(self.viewIn[1], 1)
+ :expand(self.viewIn[1], self.viewIn[2]))
+
+ -- view back to the original weight shape
+ self.gradV = self.gradV:view(self.viewOut)
+
+ self.weight:copy(self:permuteOut(self.gradV))
+end
+
+function WeightNorm:updateOutput(input)
+ if not(self.train == false) then
+ self:updateWeight()
+ end
+ self.output:set(self.modules[1]:updateOutput(input))
+ return self.output
+end
+
+function WeightNorm:accGradParameters(input, gradOutput, scale)
+ scale = scale or 1
+ self.modules[1]:accGradParameters(input, gradOutput, scale)
+
+ self.weight:copy(self:permuteIn(self.weight))
+ self.gradV:copy(self:permuteIn(self.gradWeight))
+ self.weight = self.weight:view(self.viewIn)
+
+ local norm = self._norm:view(self.viewIn[1], 1):expand(self.viewIn[1], self.viewIn[2])
+ -- use a distinct name so the expanded scale does not shadow the 'scale' argument
+ local scaleExp = self._scale:view(self.viewIn[1], 1):expand(self.viewIn[1], self.viewIn[2])
+
+ -- dL/dg = dL/dw . (v / ||v||), summed per output row
+ self.weight:copy(self.gradV)
+ self.weight:cmul(self.v):cdiv(norm)
+ self.gradG:sum(self.weight, 2)
+
+ -- dL/dw * g / ||v||
+ self.gradV:cmul(scaleExp)
+
+ -- dL/dg * (v * g / ||v||^2)
+ self.weight:copy(self.v):cmul(scaleExp):cdiv(norm)
+ self.weight:cmul(self.gradG:view(self.viewIn[1], 1)
+ :expand(self.viewIn[1], self.viewIn[2]))
+
+ -- dL / dv update
+ self.gradV:add(-1, self.weight)
+
+ self.gradV = self.gradV:view(self.viewOut)
+ self.weight = self.weight:view(self.viewOut)
+ self.gradWeight:copy(self:permuteOut(self.gradV))
+end
+
+function WeightNorm:updateGradInput(input, gradOutput)
+ self.gradInput:set(self.modules[1]:updateGradInput(input, gradOutput))
+ return self.gradInput
+end
+
+function WeightNorm:zeroGradParameters()
+ self.modules[1]:zeroGradParameters()
+ self.gradV:zero()
+ self.gradG:zero()
+end
+
+function WeightNorm:updateParameters(lr)
+ self.modules[1]:updateParameters(lr)
+ self.g:add(-lr, self.gradG)
+ self.v:add(-lr, self.gradV)
+end
+
+function WeightNorm:parameters()
+ if self.bias then
+ return {self.v, self.g, self.bias}, {self.gradV, self.gradG, self.gradBias}
+ else
+ return {self.v, self.g}, {self.gradV, self.gradG}
+ end
+end
+
+function WeightNorm:write(file)
+ -- Don't save weight and gradWeight since we can easily re-compute it from v
+ -- and g.
+ local weight = self.modules[1].weight
+ local gradWeight = self.modules[1].gradWeight
+ self.weight = nil
+ self.gradWeight = nil
+ self.modules[1].weight = nil
+ self.modules[1].gradWeight = nil
+ if not self.weightSize then
+ self.weightSize = weight:size()
+ end
+
+ parent.write(self, file)
+
+ self.modules[1].weight = weight
+ self.modules[1].gradWeight = gradWeight
+ self.weight = weight
+ self.gradWeight = gradWeight
+end
+
+function WeightNorm:read(file)
+ parent.read(self, file)
+
+ -- Re-compute weight and gradWeight
+ if not self.weight then
+ self.modules[1].weight = self.v.new(self.weightSize)
+ self.modules[1].gradWeight = self.v.new(self.weightSize)
+ self.weight = self.modules[1].weight
+ self.gradWeight = self.modules[1].gradWeight
+ self:updateWeight()
+ self.gradWeight:copy(self:permuteOut(self.gradV))
+ end
+end
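+
+-- Usage sketch (illustrative; the wrapping follows the upstream torch/nn
+-- API, nn.WeightNorm(module [, outputDim])):
+--
+--   local wn = nn.WeightNorm(nn.Linear(5, 3))
+--   local y = wn:forward(torch.randn(2, 5))
+--
+-- During training the wrapped weight is rebuilt on every forward as
+-- w = g * v / ||v|| (see updateWeight above), so the length g and the
+-- direction v are optimized independently.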
diff --git a/contrib/lua-torch/nn/WeightedEuclidean.lua b/contrib/lua-torch/nn/WeightedEuclidean.lua
new file mode 100644
index 000000000..dbf4158a9
--- /dev/null
+++ b/contrib/lua-torch/nn/WeightedEuclidean.lua
@@ -0,0 +1,244 @@
+local WeightedEuclidean, parent = torch.class('nn.WeightedEuclidean', 'nn.Module')
+
+function WeightedEuclidean:__init(inputSize,outputSize)
+ parent.__init(self)
+
+ self.weight = torch.Tensor(inputSize,outputSize)
+ self.gradWeight = torch.Tensor(inputSize,outputSize)
+
+ -- each template (output dim) has its own diagonal covariance matrix
+ self.diagCov = torch.Tensor(inputSize,outputSize)
+ self.gradDiagCov = torch.Tensor(inputSize,outputSize)
+
+ self:reset()
+end
+
+function WeightedEuclidean:reset(stdv)
+ if stdv then
+ stdv = stdv * math.sqrt(3)
+ else
+ stdv = 1./math.sqrt(self.weight:size(1))
+ end
+ self.weight:uniform(-stdv, stdv)
+ self.diagCov:fill(1)
+end
+
+local function view(res, src, ...)
+ local args = {...}
+ if src:isContiguous() then
+ res:view(src, table.unpack(args))
+ else
+ res:reshape(src, table.unpack(args))
+ end
+end
+
+function WeightedEuclidean:updateOutput(input)
+ -- lazy-initialize
+ self._diagCov = self._diagCov or self.output.new()
+
+ self._input = self._input or input.new()
+ self._weight = self._weight or self.weight.new()
+ self._expand = self._expand or self.output.new()
+   self._expand2 = self._expand2 or self.output.new()
+ self._expand3 = self._expand3 or self.output.new()
+ self._repeat = self._repeat or self.output.new()
+ self._repeat2 = self._repeat2 or self.output.new()
+ self._repeat3 = self._repeat3 or self.output.new()
+
+ local inputSize, outputSize = self.weight:size(1), self.weight:size(2)
+
+ -- y_j = || c_j * (w_j - x) ||
+ if input:dim() == 1 then
+ view(self._input, input, inputSize, 1)
+ self._expand:expandAs(self._input, self.weight)
+ self._repeat:resizeAs(self._expand):copy(self._expand)
+ self._repeat:add(-1, self.weight)
+ self._repeat:cmul(self.diagCov)
+ self.output:norm(self._repeat, 2, 1)
+ self.output:resize(outputSize)
+ elseif input:dim() == 2 then
+ local batchSize = input:size(1)
+
+ view(self._input, input, batchSize, inputSize, 1)
+ self._expand:expand(self._input, batchSize, inputSize, outputSize)
+ -- make the expanded tensor contiguous (requires lots of memory)
+ self._repeat:resizeAs(self._expand):copy(self._expand)
+
+ self._weight:view(self.weight, 1, inputSize, outputSize)
+ self._expand2:expandAs(self._weight, self._repeat)
+
+ self._diagCov:view(self.diagCov, 1, inputSize, outputSize)
+ self._expand3:expandAs(self._diagCov, self._repeat)
+ if torch.type(input) == 'torch.CudaTensor' then
+ -- requires lots of memory, but minimizes cudaMallocs and loops
+ self._repeat2:resizeAs(self._expand2):copy(self._expand2)
+ self._repeat:add(-1, self._repeat2)
+ self._repeat3:resizeAs(self._expand3):copy(self._expand3)
+ self._repeat:cmul(self._repeat3)
+ else
+ self._repeat:add(-1, self._expand2)
+ self._repeat:cmul(self._expand3)
+ end
+
+ self.output:norm(self._repeat, 2, 2)
+ self.output:resize(batchSize, outputSize)
+ else
+ error"1D or 2D input expected"
+ end
+ return self.output
+end
+
+function WeightedEuclidean:updateGradInput(input, gradOutput)
+ if not self.gradInput then
+ return
+ end
+
+ self._div = self._div or input.new()
+ self._output = self._output or self.output.new()
+ self._expand4 = self._expand4 or input.new()
+ self._gradOutput = self._gradOutput or input.new()
+
+ if not self.fastBackward then
+ self:updateOutput(input)
+ end
+
+ local inputSize, outputSize = self.weight:size(1), self.weight:size(2)
+
+ --[[
+ dy_j -2 * c_j * c_j * (w_j - x) c_j * c_j * (x - w_j)
+ ---- = -------------------------- = ---------------------
+ dx 2 || c_j * (w_j - x) || y_j
+ --]]
+
+ -- to prevent div by zero (NaN) bugs
+ self._output:resizeAs(self.output):copy(self.output):add(0.0000001)
+ view(self._gradOutput, gradOutput, gradOutput:size())
+ self._div:cdiv(gradOutput, self._output)
+ if input:dim() == 1 then
+ self._div:resize(1, outputSize)
+ self._expand4:expandAs(self._div, self.weight)
+
+ if torch.type(input) == 'torch.CudaTensor' then
+ self._repeat2:resizeAs(self._expand4):copy(self._expand4)
+ self._repeat2:cmul(self._repeat)
+ else
+ self._repeat2:cmul(self._repeat, self._expand4)
+ end
+
+ self._repeat2:cmul(self.diagCov)
+ self.gradInput:sum(self._repeat2, 2)
+ self.gradInput:resizeAs(input)
+ elseif input:dim() == 2 then
+ local batchSize = input:size(1)
+
+ self._div:resize(batchSize, 1, outputSize)
+ self._expand4:expand(self._div, batchSize, inputSize, outputSize)
+
+ if torch.type(input) == 'torch.CudaTensor' then
+ self._repeat2:resizeAs(self._expand4):copy(self._expand4)
+ self._repeat2:cmul(self._repeat)
+ self._repeat2:cmul(self._repeat3)
+ else
+ self._repeat2:cmul(self._repeat, self._expand4)
+ self._repeat2:cmul(self._expand3)
+ end
+
+ self.gradInput:sum(self._repeat2, 3)
+ self.gradInput:resizeAs(input)
+ else
+ error"1D or 2D input expected"
+ end
+
+ return self.gradInput
+end
+
+function WeightedEuclidean:accGradParameters(input, gradOutput, scale)
+ local inputSize, outputSize = self.weight:size(1), self.weight:size(2)
+ scale = scale or 1
+
+ --[[
+ dy_j 2 * c_j * c_j * (w_j - x) c_j * c_j * (w_j - x)
+ ---- = ------------------------- = ---------------------
+ dw_j 2 || c_j * (w_j - x) || y_j
+
+ dy_j 2 * c_j * (w_j - x)^2 c_j * (w_j - x)^2
+ ---- = ----------------------- = -----------------
+ dc_j 2 || c_j * (w_j - x) || y_j
+ --]]
+ -- assumes a preceding call to updateGradInput
+ if input:dim() == 1 then
+ self.gradWeight:add(-scale, self._repeat2)
+
+ self._repeat:cdiv(self.diagCov)
+ self._repeat:cmul(self._repeat)
+ self._repeat:cmul(self.diagCov)
+
+ if torch.type(input) == 'torch.CudaTensor' then
+ self._repeat2:resizeAs(self._expand4):copy(self._expand4)
+ self._repeat2:cmul(self._repeat)
+ else
+ self._repeat2:cmul(self._repeat, self._expand4)
+ end
+
+ self.gradDiagCov:add(self._repeat2)
+ elseif input:dim() == 2 then
+ self._sum = self._sum or input.new()
+ self._sum:sum(self._repeat2, 1)
+ self._sum:resize(inputSize, outputSize)
+ self.gradWeight:add(-scale, self._sum)
+
+ if torch.type(input) == 'torch.CudaTensor' then
+ -- requires lots of memory, but minimizes cudaMallocs and loops
+ self._repeat:cdiv(self._repeat3)
+ self._repeat:cmul(self._repeat)
+ self._repeat:cmul(self._repeat3)
+ self._repeat2:resizeAs(self._expand4):copy(self._expand4)
+ self._repeat:cmul(self._repeat2)
+ else
+ self._repeat:cdiv(self._expand3)
+ self._repeat:cmul(self._repeat)
+ self._repeat:cmul(self._expand3)
+ self._repeat:cmul(self._expand4)
+ end
+
+ self._sum:sum(self._repeat, 1)
+ self._sum:resize(inputSize, outputSize)
+ self.gradDiagCov:add(scale, self._sum)
+ else
+ error"1D or 2D input expected"
+ end
+end
+
+function WeightedEuclidean:type(type, tensorCache)
+ if type then
+ -- prevent premature memory allocations
+ self._input = nil
+ self._output = nil
+ self._gradOutput = nil
+ self._weight = nil
+ self._div = nil
+ self._sum = nil
+ self._expand = nil
+ self._expand2 = nil
+ self._expand3 = nil
+ self._expand4 = nil
+ self._repeat = nil
+ self._repeat2 = nil
+ self._repeat3 = nil
+ end
+ return parent.type(self, type, tensorCache)
+end
+
+function WeightedEuclidean:parameters()
+ return {self.weight, self.diagCov}, {self.gradWeight, self.gradDiagCov}
+end
+
+function WeightedEuclidean:accUpdateGradParameters(input, gradOutput, lr)
+ local gradWeight = self.gradWeight
+ local gradDiagCov = self.gradDiagCov
+ self.gradWeight = self.weight
+ self.gradDiagCov = self.diagCov
+ self:accGradParameters(input, gradOutput, -lr)
+ self.gradWeight = gradWeight
+ self.gradDiagCov = gradDiagCov
+end
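+
+-- Usage sketch (shapes are illustrative):
+--
+--   local m = nn.WeightedEuclidean(4, 2) -- two templates in R^4
+--   local y = m:forward(torch.randn(4))  -- y[j] is the norm of diagCov[:,j] .* (weight[:,j] - x)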
diff --git a/contrib/lua-torch/nn/WeightedMSECriterion.lua b/contrib/lua-torch/nn/WeightedMSECriterion.lua
new file mode 100644
index 000000000..933472937
--- /dev/null
+++ b/contrib/lua-torch/nn/WeightedMSECriterion.lua
@@ -0,0 +1,45 @@
+local WeightedMSECriterion, parent = torch.class('nn.WeightedMSECriterion','nn.MSECriterion')
+
+function WeightedMSECriterion:__init(w)
+ parent.__init(self)
+ self.weight = w:clone()
+end
+
+function WeightedMSECriterion:updateOutput(input,target)
+ self.buffer = self.buffer or input.new()
+ self.buffer:resizeAs(input):copy(target)
+ if input:dim() - 1 == self.weight:dim() then
+ for i=1,input:size(1) do
+ self.buffer[i]:cmul(self.weight)
+ end
+ else
+ self.buffer:cmul(self.weight)
+ end
+ self.output_tensor = self.output_tensor or input.new(1)
+ input.THNN.MSECriterion_updateOutput(
+ input:cdata(),
+ self.buffer:cdata(),
+ self.output_tensor:cdata(),
+ self.sizeAverage
+ )
+ self.output = self.output_tensor[1]
+ return self.output
+end
+
+function WeightedMSECriterion:updateGradInput(input, target)
+ self.buffer:resizeAs(input):copy(target)
+ if input:dim() - 1 == self.weight:dim() then
+ for i=1,input:size(1) do
+ self.buffer[i]:cmul(self.weight)
+ end
+ else
+ self.buffer:cmul(self.weight)
+ end
+ input.THNN.MSECriterion_updateGradInput(
+ input:cdata(),
+ self.buffer:cdata(),
+ self.gradInput:cdata(),
+ self.sizeAverage
+ )
+ return self.gradInput
+end
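+
+-- Usage sketch: as in the code above, the per-element weights multiply the
+-- target, so with sizeAverage the loss is mean_i (input_i - w_i*target_i)^2.
+--
+--   local crit = nn.WeightedMSECriterion(torch.Tensor{1, 1, 0.5})
+--   local loss = crit:forward(torch.randn(3), torch.randn(3))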
diff --git a/contrib/lua-torch/nn/WhiteNoise.lua b/contrib/lua-torch/nn/WhiteNoise.lua
new file mode 100644
index 000000000..f1defb646
--- /dev/null
+++ b/contrib/lua-torch/nn/WhiteNoise.lua
@@ -0,0 +1,40 @@
+local WhiteNoise, parent = torch.class('nn.WhiteNoise', 'nn.Module')
+
+function WhiteNoise:__init(mean, std)
+ parent.__init(self)
+ self.mean = mean or 0
+ self.std = std or 0.1
+ self.noise = torch.Tensor()
+end
+
+function WhiteNoise:updateOutput(input)
+ self.output:resizeAs(input):copy(input)
+ if self.train ~= false then
+ self.noise:resizeAs(input)
+ self.noise:normal(self.mean, self.std)
+ self.output:add(self.noise)
+ else
+ if self.mean ~= 0 then
+ self.output:add(self.mean)
+ end
+ end
+ return self.output
+end
+
+function WhiteNoise:updateGradInput(input, gradOutput)
+ if self.train ~= false then
+ -- Simply return the gradients.
+ self.gradInput:resizeAs(gradOutput):copy(gradOutput)
+ else
+ error('backprop only defined while training')
+ end
+ return self.gradInput
+end
+
+function WhiteNoise:clearState()
+ self.noise:set()
+end
+
+function WhiteNoise:__tostring__()
+ return string.format('%s mean: %f, std: %f', torch.type(self), self.mean, self.std)
+end
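+
+-- Usage sketch (x is any tensor): noise is injected only in training mode.
+--
+--   local noise = nn.WhiteNoise(0, 0.1)
+--   noise:training()
+--   local y = noise:forward(x) -- y = x + samples from N(0, 0.1)
+--   noise:evaluate()
+--   local z = noise:forward(x) -- z = x (mean is 0, so nothing is added)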
diff --git a/contrib/lua-torch/nn/ZeroGrad.lua b/contrib/lua-torch/nn/ZeroGrad.lua
new file mode 100644
index 000000000..7c941ce1c
--- /dev/null
+++ b/contrib/lua-torch/nn/ZeroGrad.lua
@@ -0,0 +1,14 @@
+local ZeroGrad, parent = torch.class('nn.ZeroGrad', 'nn.Module')
+
+function ZeroGrad:updateOutput(input)
+ self.output:set(input)
+ return self.output
+end
+
+-- the gradient is simply zeroed.
+-- useful when you don't want to backpropagate through certain paths.
+function ZeroGrad:updateGradInput(input, gradOutput)
+ self.gradInput = nn.utils.recursiveResizeAs(self.gradInput, input)
+ self.gradInput = nn.utils.recursiveFill(self.gradInput, 0)
+ return self.gradInput
+end
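+
+-- Usage sketch: forward is the identity and backward returns zeros, so
+-- modules feeding into ZeroGrad receive zero gradient contributions.
+--
+--   local block = nn.Sequential():add(nn.Linear(3, 3)):add(nn.ZeroGrad())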
diff --git a/contrib/lua-torch/nn/ZipTable.lua b/contrib/lua-torch/nn/ZipTable.lua
new file mode 100644
index 000000000..7b18619eb
--- /dev/null
+++ b/contrib/lua-torch/nn/ZipTable.lua
@@ -0,0 +1,34 @@
+local ZipTable, parent = torch.class('nn.ZipTable', 'nn.Module')
+
+-- input : { {a1,a2}, {b1,b2}, {c1,c2} }
+-- output : { {a1,b1,c1}, {a2,b2,c2} }
+function ZipTable:__init()
+ parent.__init(self)
+ self.output = {}
+ self.gradInput = {}
+end
+
+function ZipTable:updateOutput(inputTable)
+ self.output = {}
+ for i,inTable in ipairs(inputTable) do
+ for j,input in ipairs(inTable) do
+ local output = self.output[j] or {}
+ output[i] = input
+ self.output[j] = output
+ end
+ end
+ return self.output
+end
+
+function ZipTable:updateGradInput(inputTable, gradOutputTable)
+ self.gradInput = {}
+ for i,gradOutTable in ipairs(gradOutputTable) do
+ for j,gradOutput in ipairs(gradOutTable) do
+ local gradInput = self.gradInput[j] or {}
+ gradInput[i] = gradOutput
+ self.gradInput[j] = gradInput
+ end
+ end
+ return self.gradInput
+end
+
diff --git a/contrib/lua-torch/nn/ZipTableOneToMany.lua b/contrib/lua-torch/nn/ZipTableOneToMany.lua
new file mode 100644
index 000000000..d4a80fe0d
--- /dev/null
+++ b/contrib/lua-torch/nn/ZipTableOneToMany.lua
@@ -0,0 +1,37 @@
+local ZipTableOneToMany, parent = torch.class('nn.ZipTableOneToMany', 'nn.Module')
+
+-- based on ZipTable in dpnn
+
+-- input : { v, {a, b, c} }
+-- output : { {v,a}, {v,b}, {v,c} }
+function ZipTableOneToMany:__init()
+ parent.__init(self)
+ self.output = {}
+ self.gradInput = {}
+ -- make buffer to update during forward/backward
+ self.gradInputEl = torch.Tensor()
+end
+
+function ZipTableOneToMany:updateOutput(input)
+ assert(#input == 2, "input must be table of element and table")
+ local inputEl, inputTable = input[1], input[2]
+ self.output = {}
+ for i,v in ipairs(inputTable) do
+ self.output[i] = {inputEl, v}
+ end
+ return self.output
+end
+
+function ZipTableOneToMany:updateGradInput(input, gradOutput)
+ assert(#input == 2, "input must be table of element and table")
+ local inputEl, inputTable = input[1], input[2]
+ self.gradInputEl:resizeAs(inputEl):zero()
+ local gradInputTable = {}
+ for i,gradV in ipairs(gradOutput) do
+ self.gradInputEl:add(gradV[1])
+ gradInputTable[i] = gradV[2]
+ end
+ self.gradInput = {self.gradInputEl, gradInputTable}
+ return self.gradInput
+end
+
diff --git a/contrib/lua-torch/nn/hessian.lua b/contrib/lua-torch/nn/hessian.lua
new file mode 100644
index 000000000..b841d8c59
--- /dev/null
+++ b/contrib/lua-torch/nn/hessian.lua
@@ -0,0 +1,391 @@
+----------------------------------------------------------------------
+-- hessian.lua: this file appends extra methods to modules in nn,
+-- to estimate diagonal elements of the Hessian. This is useful
+-- to condition learning rates individually.
+----------------------------------------------------------------------
+nn.hessian = {}
+
+----------------------------------------------------------------------
+-- Hessian code is still experimental,
+-- and deactivated by default
+----------------------------------------------------------------------
+function nn.hessian.enable()
+
+ local function accDiagHessianParameters(module, input, diagHessianOutput, gw, hw)
+      if #gw ~= #hw then
+         error('Number of gradients is not equal to number of hessians')
+      end
+ module.inputSq = module.inputSq or input.new()
+ module.inputSq:resizeAs(input)
+ torch.cmul(module.inputSq, input, input)
+ -- replace gradients with hessian
+ for i=1,#gw do
+ local gwname = gw[i]
+ local hwname = hw[i]
+ local gwval = module[gwname]
+ local hwval = module[hwname]
+ if hwval == nil then
+ module[hwname] = gwval.new():resizeAs(gwval)
+ hwval = module[hwname]
+ end
+ module[gwname] = hwval
+ module[hwname] = gwval
+ end
+ local oldOutput = module.output
+ module.output = module.output.new():resizeAs(oldOutput)
+ module.forward(module, module.inputSq)
+ module.accGradParameters(module, module.inputSq, diagHessianOutput, 1)
+ -- put back gradients
+ for i=1,#gw do
+ local gwname = gw[i]
+ local hwname = hw[i]
+ local gwval = module[gwname]
+ local hwval = module[hwname]
+ module[gwname] = hwval
+ module[hwname] = gwval
+ end
+ module.output = oldOutput
+ end
+ nn.hessian.accDiagHessianParameters = accDiagHessianParameters
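+   -- The buffer swap above makes accGradParameters accumulate x^2 * H_y
+   -- into the diagHessian buffers: the diagonal Gauss-Newton
+   -- (Levenberg-Marquardt) approximation of the parameter Hessian,
+   -- cf. LeCun et al., "Efficient BackProp".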
+
+ local function updateDiagHessianInput(module, input, diagHessianOutput, w, wsq)
+ if #w ~= #wsq then
+ error('Number of weights is not equal to number of weights squares')
+ end
+ module.diagHessianInput = module.diagHessianInput or input.new()
+ module.diagHessianInput:resizeAs(input):zero()
+
+ local gi = module.gradInput
+ module.gradInput = module.diagHessianInput
+ for i=1,#w do
+ local wname = w[i]
+ local wsqname = wsq[i]
+ local wval = module[wname]
+ local wsqval = module[wsqname]
+ if wsqval == nil then
+ module[wsqname] = wval.new()
+ wsqval = module[wsqname]
+ end
+ wsqval:resizeAs(wval)
+ torch.cmul(wsqval, wval, wval)
+ module[wsqname] = wval
+ module[wname] = wsqval
+ end
+ module.updateGradInput(module,input,diagHessianOutput)
+ for i=1,#w do
+ local wname = w[i]
+ local wsqname = wsq[i]
+ local wval = module[wname]
+ local wsqval = module[wsqname]
+ module[wname] = wsqval
+ module[wsqname] = wval
+ end
+ module.gradInput = gi
+ end
+ nn.hessian.updateDiagHessianInput = updateDiagHessianInput
+
+ local function updateDiagHessianInputPointWise(module, input, diagHessianOutput)
+ local tdh = diagHessianOutput.new():resizeAs(diagHessianOutput):fill(1)
+ updateDiagHessianInput(module,input,tdh,{},{})
+ module.diagHessianInput:cmul(module.diagHessianInput)
+ module.diagHessianInput:cmul(diagHessianOutput)
+ end
+ nn.hessian.updateDiagHessianInputPointWise = updateDiagHessianInputPointWise
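+   -- For a pointwise y = f(x), backpropagating a gradient of ones yields
+   -- f'(x); squaring it and scaling by the output Hessian approximates the
+   -- input Hessian as f'(x)^2 * H_y (the f'' term is dropped, as in
+   -- Gauss-Newton).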
+
+ local function initDiagHessianParameters(module,gw,hw)
+ module.diagHessianInput = module.diagHessianInput or module.gradInput.new();
+ for i=1,#gw do
+ module[hw[i]] = module[hw[i]] or module[gw[i]].new():resizeAs(module[gw[i]])
+ end
+ end
+ nn.hessian.initDiagHessianParameters = initDiagHessianParameters
+
+ ----------------------------------------------------------------------
+ -- Module
+ ----------------------------------------------------------------------
+ function nn.Module.updateDiagHessianInput(self, input, diagHessianOutput)
+ error(torch.typename(self) .. ':updateDiagHessianInput() is undefined')
+ end
+
+ function nn.Module.accDiagHessianParameters(self, input, diagHessianOutput)
+ end
+
+ function nn.Module.initDiagHessianParameters()
+ end
+
+ ----------------------------------------------------------------------
+ -- Sequential
+ ----------------------------------------------------------------------
+ function nn.Sequential.initDiagHessianParameters(self)
+ for i=1,#self.modules do
+ self.modules[i]:initDiagHessianParameters()
+ end
+ end
+
+ function nn.Sequential.updateDiagHessianInput(self, input, diagHessianOutput)
+ local currentDiagHessianOutput = diagHessianOutput
+ local currentModule = self.modules[#self.modules]
+ for i=#self.modules-1,1,-1 do
+ local previousModule = self.modules[i]
+ currentDiagHessianOutput = currentModule:updateDiagHessianInput(previousModule.output, currentDiagHessianOutput)
+ currentModule = previousModule
+ end
+ currentDiagHessianOutput = currentModule:updateDiagHessianInput(input, currentDiagHessianOutput)
+ self.diagHessianInput = currentDiagHessianOutput
+ return currentDiagHessianOutput
+ end
+
+ function nn.Sequential.accDiagHessianParameters(self, input, diagHessianOutput)
+ local currentDiagHessianOutput = diagHessianOutput
+ local currentModule = self.modules[#self.modules]
+ for i=#self.modules-1,1,-1 do
+ local previousModule = self.modules[i]
+ currentModule:accDiagHessianParameters(previousModule.output, currentDiagHessianOutput)
+ currentDiagHessianOutput = currentModule.diagHessianInput
+ currentModule = previousModule
+ end
+ currentModule:accDiagHessianParameters(input, currentDiagHessianOutput)
+ end
+
+ ----------------------------------------------------------------------
+ -- Criterion
+ ----------------------------------------------------------------------
+ function nn.Criterion.updateDiagHessianInput(self, input, diagHessianOutput)
+ error(torch.typename(self) .. ':updateDiagHessianInput() is undefined')
+ end
+
+ ----------------------------------------------------------------------
+ -- MSECriterion
+ ----------------------------------------------------------------------
+ function nn.MSECriterion.updateDiagHessianInput(self, input, target)
+ self.diagHessianInput = self.diagHessianInput or input.new()
+ local val = 2
+ if self.sizeAverage then
+ val = val / input:nElement()
+ end
+ self.diagHessianInput:resizeAs(input):fill(val)
+ return self.diagHessianInput
+ end
+
+ ----------------------------------------------------------------------
+ -- WeightedMSECriterion
+ ----------------------------------------------------------------------
+ function nn.WeightedMSECriterion.updateDiagHessianInput(self,input,target)
+ return nn.MSECriterion.updateDiagHessianInput(self,input,target)
+ end
+
+ ----------------------------------------------------------------------
+ -- L1Cost
+ ----------------------------------------------------------------------
+ function nn.L1Cost.updateDiagHessianInput(self,input)
+ self.diagHessianInput = self.diagHessianInput or input.new()
+ self.diagHessianInput:resizeAs(input)
+ self.diagHessianInput:fill(1)
+ self.diagHessianInput[torch.eq(input,0)] = 0
+ return self.diagHessianInput
+ end
+
+ ----------------------------------------------------------------------
+ -- Linear
+ ----------------------------------------------------------------------
+ function nn.Linear.updateDiagHessianInput(self, input, diagHessianOutput)
+ updateDiagHessianInput(self, input, diagHessianOutput, {'weight'}, {'weightSq'})
+ return self.diagHessianInput
+ end
+
+ function nn.Linear.accDiagHessianParameters(self, input, diagHessianOutput)
+ accDiagHessianParameters(self,input, diagHessianOutput, {'gradWeight','gradBias'}, {'diagHessianWeight','diagHessianBias'})
+ end
+
+ function nn.Linear.initDiagHessianParameters(self)
+ initDiagHessianParameters(self,{'gradWeight','gradBias'},{'diagHessianWeight','diagHessianBias'})
+ end
+
+ ----------------------------------------------------------------------
+ -- SpatialConvolution
+ ----------------------------------------------------------------------
+ function nn.SpatialConvolution.updateDiagHessianInput(self, input, diagHessianOutput)
+ updateDiagHessianInput(self, input, diagHessianOutput, {'weight'}, {'weightSq'})
+ return self.diagHessianInput
+ end
+
+ function nn.SpatialConvolution.accDiagHessianParameters(self, input, diagHessianOutput)
+ accDiagHessianParameters(self,input, diagHessianOutput, {'gradWeight','gradBias'}, {'diagHessianWeight','diagHessianBias'})
+ end
+
+ function nn.SpatialConvolution.initDiagHessianParameters(self)
+ initDiagHessianParameters(self,{'gradWeight','gradBias'},{'diagHessianWeight','diagHessianBias'})
+ end
+
+ ----------------------------------------------------------------------
+ -- SpatialConvolutionLocal
+ ----------------------------------------------------------------------
+ function nn.SpatialConvolutionLocal.updateDiagHessianInput(self, input, diagHessianOutput)
+ updateDiagHessianInput(self, input, diagHessianOutput, {'weight'}, {'weightSq'})
+ return self.diagHessianInput
+ end
+
+ function nn.SpatialConvolutionLocal.accDiagHessianParameters(self, input, diagHessianOutput)
+ accDiagHessianParameters(self,input, diagHessianOutput, {'gradWeight','gradBias'}, {'diagHessianWeight','diagHessianBias'})
+ end
+
+ function nn.SpatialConvolutionLocal.initDiagHessianParameters(self)
+ initDiagHessianParameters(self,{'gradWeight','gradBias'},{'diagHessianWeight','diagHessianBias'})
+ end
+
+ ----------------------------------------------------------------------
+ -- SpatialFullConvolution
+ ----------------------------------------------------------------------
+ function nn.SpatialFullConvolution.updateDiagHessianInput(self, input, diagHessianOutput)
+ updateDiagHessianInput(self, input, diagHessianOutput, {'weight'}, {'weightSq'})
+ return self.diagHessianInput
+ end
+
+ function nn.SpatialFullConvolution.accDiagHessianParameters(self, input, diagHessianOutput)
+ accDiagHessianParameters(self,input, diagHessianOutput, {'gradWeight','gradBias'}, {'diagHessianWeight','diagHessianBias'})
+ end
+
+ function nn.SpatialFullConvolution.initDiagHessianParameters(self)
+ initDiagHessianParameters(self,{'gradWeight','gradBias'},{'diagHessianWeight','diagHessianBias'})
+ end
+
+ ----------------------------------------------------------------------
+ -- SpatialConvolutionMap
+ ----------------------------------------------------------------------
+ function nn.SpatialConvolutionMap.updateDiagHessianInput(self, input, diagHessianOutput)
+ updateDiagHessianInput(self, input, diagHessianOutput, {'weight','bias'}, {'weightSq','biasSq'})
+ return self.diagHessianInput
+ end
+
+ function nn.SpatialConvolutionMap.accDiagHessianParameters(self, input, diagHessianOutput)
+ accDiagHessianParameters(self,input, diagHessianOutput, {'gradWeight','gradBias'}, {'diagHessianWeight','diagHessianBias'})
+ end
+
+ function nn.SpatialConvolutionMap.initDiagHessianParameters(self)
+ initDiagHessianParameters(self,{'gradWeight','gradBias'},{'diagHessianWeight','diagHessianBias'})
+ end
+
+ ----------------------------------------------------------------------
+ -- SpatialFullConvolutionMap
+ ----------------------------------------------------------------------
+ function nn.SpatialFullConvolutionMap.updateDiagHessianInput(self, input, diagHessianOutput)
+ updateDiagHessianInput(self, input, diagHessianOutput, {'weight'}, {'weightSq'})
+ return self.diagHessianInput
+ end
+
+ function nn.SpatialFullConvolutionMap.accDiagHessianParameters(self, input, diagHessianOutput)
+ accDiagHessianParameters(self,input, diagHessianOutput, {'gradWeight','gradBias'}, {'diagHessianWeight','diagHessianBias'})
+ end
+
+ function nn.SpatialFullConvolutionMap.initDiagHessianParameters(self)
+ initDiagHessianParameters(self,{'gradWeight','gradBias'},{'diagHessianWeight','diagHessianBias'})
+ end
+
+   ----------------------------------------------------------------------
+ -- Tanh
+ ----------------------------------------------------------------------
+ function nn.Tanh.updateDiagHessianInput(self, input, diagHessianOutput)
+ updateDiagHessianInputPointWise(self, input, diagHessianOutput)
+ return self.diagHessianInput
+ end
+
+ ----------------------------------------------------------------------
+ -- TanhShrink
+ ----------------------------------------------------------------------
+ function nn.TanhShrink.updateDiagHessianInput(self, input, diagHessianOutput)
+ updateDiagHessianInputPointWise(self.tanh, input, diagHessianOutput)
+ self.diagHessianInput = self.diagHessianInput or input.new():resizeAs(input)
+ torch.add(self.diagHessianInput, self.tanh.diagHessianInput, diagHessianOutput)
+ return self.diagHessianInput
+ end
+
+ ----------------------------------------------------------------------
+ -- Square
+ ----------------------------------------------------------------------
+ function nn.Square.updateDiagHessianInput(self, input, diagHessianOutput)
+ updateDiagHessianInputPointWise(self, input, diagHessianOutput)
+ return self.diagHessianInput
+ end
+
+ ----------------------------------------------------------------------
+ -- Sqrt
+ ----------------------------------------------------------------------
+ function nn.Sqrt.updateDiagHessianInput(self, input, diagHessianOutput)
+ updateDiagHessianInputPointWise(self, input, diagHessianOutput)
+ return self.diagHessianInput
+ end
+
+ ----------------------------------------------------------------------
+ -- Reshape
+ ----------------------------------------------------------------------
+ function nn.Reshape.updateDiagHessianInput(self, input, diagHessianOutput)
+ self.diagHessianInput = self.diagHessianInput or input.new()
+ diagHessianOutput = diagHessianOutput:contiguous()
+ self.diagHessianInput:set(diagHessianOutput):resizeAs(input)
+ return self.diagHessianInput
+ end
+
+ ----------------------------------------------------------------------
+ -- Parameters manipulation:
+ -- we modify these functions such that they return hessian coefficients
+ ----------------------------------------------------------------------
+ function nn.Module.parameters(self)
+ if self.weight and self.bias then
+ return {self.weight, self.bias}, {self.gradWeight, self.gradBias}, {self.diagHessianWeight, self.diagHessianBias}
+ elseif self.weight then
+ return {self.weight}, {self.gradWeight}, {self.diagHessianWeight}
+ elseif self.bias then
+ return {self.bias}, {self.gradBias}, {self.diagHessianBias}
+ else
+ return
+ end
+ end
+
+ function nn.Module.getParameters(self)
+ -- get parameters
+ local parameters,gradParameters,hessianParameters = self:parameters()
+ -- flatten parameters and gradients
+ local flatParameters = nn.Module.flatten(parameters)
+ collectgarbage()
+ local flatGradParameters = nn.Module.flatten(gradParameters)
+ collectgarbage()
+ local flatHessianParameters
+ if hessianParameters and hessianParameters[1] then
+ flatHessianParameters = nn.Module.flatten(hessianParameters)
+ collectgarbage()
+ end
+
+ -- return new flat vector that contains all discrete parameters
+ return flatParameters, flatGradParameters, flatHessianParameters
+ end
+
+ function nn.Sequential.parameters(self)
+ local function tinsert(to, from)
+ if type(from) == 'table' then
+ for i=1,#from do
+ tinsert(to,from[i])
+ end
+ else
+ table.insert(to,from)
+ end
+ end
+ local w = {}
+ local gw = {}
+ local ggw = {}
+ for i=1,#self.modules do
+ local mw,mgw,mggw = self.modules[i]:parameters()
+ if mw then
+ tinsert(w,mw)
+ tinsert(gw,mgw)
+ tinsert(ggw,mggw)
+ end
+ end
+ return w,gw,ggw
+ end
+
+ ----------------------------------------------------------------------
+ -- Avoid multiple calls to enable()
+ ----------------------------------------------------------------------
+ function nn.hessian.enable()
+ end
+end
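+
+-- Usage sketch (Hessian support is experimental, per the note above):
+--
+--   nn.hessian.enable()
+--   local model = nn.Sequential():add(nn.Linear(10, 2))
+--   model:initDiagHessianParameters()
+--   -- after the forward/backward and DiagHessian passes, getParameters()
+--   -- also returns the flattened diagonal Hessian estimates:
+--   local p, gp, hp = model:getParameters()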
diff --git a/contrib/lua-torch/nn/init.lua b/contrib/lua-torch/nn/init.lua
new file mode 100755
index 000000000..4319a8868
--- /dev/null
+++ b/contrib/lua-torch/nn/init.lua
@@ -0,0 +1,221 @@
+require('torch')
+
+nn = {} -- define the global nn table
+
+require('nn.THNN')
+
+require('nn.utils')
+
+
+require('nn.ErrorMessages')
+require('nn.Module')
+
+require('nn.Container')
+require('nn.Concat')
+require('nn.Parallel')
+require('nn.Sequential')
+require('nn.DepthConcat')
+
+require('nn.Decorator')
+require('nn.Bottle')
+require('nn.WeightNorm')
+require('nn.DontCast')
+require('nn.NaN')
+require('nn.Profile')
+
+require('nn.Linear')
+require('nn.LinearWeightNorm')
+require('nn.Bilinear')
+require('nn.PartialLinear')
+require('nn.SparseLinear')
+require('nn.IndexLinear')
+require('nn.Reshape')
+require('nn.View')
+require('nn.Contiguous')
+require('nn.Select')
+require('nn.Narrow')
+require('nn.Index')
+require('nn.Squeeze')
+require('nn.Unsqueeze')
+require('nn.Replicate')
+require('nn.Transpose')
+require('nn.BatchNormalization')
+require('nn.LayerNormalization')
+require('nn.Padding')
+require('nn.GradientReversal')
+require('nn.MaskedSelect')
+
+require('nn.Copy')
+require('nn.Min')
+require('nn.Max')
+require('nn.Sum')
+require('nn.Mean')
+require('nn.CMul')
+require('nn.Mul')
+require('nn.MulConstant')
+require('nn.CAdd')
+require('nn.Add')
+require('nn.AddConstant')
+require('nn.Constant')
+require('nn.Dropout')
+require('nn.SpatialDropout')
+require('nn.VolumetricDropout')
+require('nn.WhiteNoise')
+require('nn.OneHot')
+require('nn.PrintSize')
+require('nn.ZeroGrad')
+
+require('nn.CAddTable')
+require('nn.CDivTable')
+require('nn.CMulTable')
+require('nn.CSubTable')
+require('nn.CMaxTable')
+require('nn.CMinTable')
+require('nn.CAddTensorTable')
+
+require('nn.Euclidean')
+require('nn.WeightedEuclidean')
+require('nn.PairwiseDistance')
+require('nn.CosineDistance')
+require('nn.DotProduct')
+require('nn.Normalize')
+require('nn.Cosine')
+require('nn.Kmeans')
+
+require('nn.Exp')
+require('nn.Log')
+require('nn.HardTanh')
+require('nn.Clamp')
+require('nn.LogSigmoid')
+require('nn.LogSoftMax')
+require('nn.Sigmoid')
+require('nn.SoftMax')
+require('nn.SoftMin')
+require('nn.SoftPlus')
+require('nn.SoftSign')
+require('nn.Tanh')
+require('nn.TanhShrink')
+require('nn.Abs')
+require('nn.Power')
+require('nn.Square')
+require('nn.Sqrt')
+require('nn.HardShrink')
+require('nn.SoftShrink')
+require('nn.Threshold')
+require('nn.Maxout')
+require('nn.ReLU')
+require('nn.ReLU6')
+require('nn.PReLU')
+require('nn.CReLU')
+require('nn.LeakyReLU')
+require('nn.SpatialSoftMax')
+require('nn.SpatialLogSoftMax')
+require('nn.RReLU')
+require('nn.ELU')
+require('nn.GatedLinearUnit')
+
+require('nn.LookupTable')
+require('nn.SpatialConvolution')
+require('nn.SpatialConvolutionLocal')
+require('nn.SpatialFullConvolution')
+require('nn.SpatialFullConvolutionMap')
+require('nn.SpatialConvolutionMM')
+require('nn.SpatialDepthWiseConvolution')
+require('nn.SpatialConvolutionMap')
+require('nn.SpatialDilatedConvolution')
+require('nn.SpatialSubSampling')
+require('nn.SpatialMaxPooling')
+require('nn.SpatialDilatedMaxPooling')
+require('nn.SpatialMaxUnpooling')
+require('nn.SpatialFractionalMaxPooling')
+require('nn.SpatialLPPooling')
+require('nn.SpatialAveragePooling')
+require('nn.SpatialAdaptiveMaxPooling')
+require('nn.SpatialAdaptiveAveragePooling')
+require('nn.TemporalConvolution')
+require('nn.TemporalSubSampling')
+require('nn.TemporalMaxPooling')
+require('nn.TemporalDynamicKMaxPooling')
+require('nn.TemporalRowConvolution')
+require('nn.SpatialSubtractiveNormalization')
+require('nn.SpatialDivisiveNormalization')
+require('nn.SpatialContrastiveNormalization')
+require('nn.SpatialCrossMapLRN')
+require('nn.SpatialZeroPadding')
+require('nn.SpatialReflectionPadding')
+require('nn.SpatialReplicationPadding')
+require('nn.SpatialUpSamplingNearest')
+require('nn.SpatialUpSamplingBilinear')
+require('nn.SpatialBatchNormalization')
+
+require('nn.VolumetricConvolution')
+require('nn.VolumetricFullConvolution')
+require('nn.VolumetricDilatedConvolution')
+require('nn.VolumetricMaxPooling')
+require('nn.VolumetricDilatedMaxPooling')
+require('nn.VolumetricFractionalMaxPooling')
+require('nn.VolumetricMaxUnpooling')
+require('nn.VolumetricAveragePooling')
+require('nn.VolumetricBatchNormalization')
+require('nn.VolumetricReplicationPadding')
+
+require('nn.GPU')
+
+require('nn.ParallelTable')
+require('nn.Identity')
+require('nn.ConcatTable')
+require('nn.SplitTable')
+require('nn.JoinTable')
+require('nn.SelectTable')
+require('nn.MixtureTable')
+require('nn.CriterionTable')
+require('nn.FlattenTable')
+require('nn.NarrowTable')
+require('nn.MapTable')
+require('nn.ZipTable')
+require('nn.ZipTableOneToMany')
+require('nn.Collapse')
+require('nn.Convert')
+
+require('nn.Criterion')
+require('nn.MSECriterion')
+require('nn.SpatialAutoCropMSECriterion')
+require('nn.SmoothL1Criterion')
+require('nn.MarginCriterion')
+require('nn.SoftMarginCriterion')
+require('nn.AbsCriterion')
+require('nn.ClassNLLCriterion')
+require('nn.SpatialClassNLLCriterion')
+require('nn.ClassSimplexCriterion')
+require('nn.DistKLDivCriterion')
+require('nn.MultiCriterion')
+require('nn.L1HingeEmbeddingCriterion')
+require('nn.HingeEmbeddingCriterion')
+require('nn.CosineEmbeddingCriterion')
+require('nn.MarginRankingCriterion')
+require('nn.MultiMarginCriterion')
+require('nn.MultiLabelMarginCriterion')
+require('nn.MultiLabelSoftMarginCriterion')
+require('nn.L1Cost')
+require('nn.L1Penalty')
+require('nn.WeightedMSECriterion')
+require('nn.BCECriterion')
+require('nn.CrossEntropyCriterion')
+require('nn.ParallelCriterion')
+require('nn.DistanceRatioCriterion')
+require('nn.ModuleCriterion')
+
+require('nn.PixelShuffle')
+
+require('nn.StochasticGradient')
+
+require('nn.MM')
+require('nn.MV')
+
+require('nn.Jacobian')
+require('nn.SparseJacobian')
+require('nn.hessian')
+require('nn.test')
+
+
+return nn
diff --git a/contrib/lua-torch/nn/lib/CMakeLists.txt b/contrib/lua-torch/nn/lib/CMakeLists.txt
new file mode 100644
index 000000000..de04595f6
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/CMakeLists.txt
@@ -0,0 +1,5 @@
+CMAKE_MINIMUM_REQUIRED(VERSION 2.6 FATAL_ERROR)
+CMAKE_POLICY(VERSION 2.6)
+SET(THNN_INSTALL_LIB_SUBDIR "${RSPAMD_LIBDIR}")
+SET(THNN_INSTALL_INCLUDE_SUBDIR "${Torch_INSTALL_INCLUDE_SUBDIR}")
+ADD_SUBDIRECTORY(THNN) \ No newline at end of file
diff --git a/contrib/lua-torch/nn/lib/THNN/CMakeLists.txt b/contrib/lua-torch/nn/lib/THNN/CMakeLists.txt
new file mode 100644
index 000000000..00908a5b1
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/CMakeLists.txt
@@ -0,0 +1,47 @@
+CMAKE_MINIMUM_REQUIRED(VERSION 2.6 FATAL_ERROR)
+CMAKE_POLICY(VERSION 2.6)
+
+IF(NOT TH_LIBRARIES)
+ SET(TH_LIBRARIES "TH")
+ENDIF(NOT TH_LIBRARIES)
+MESSAGE(STATUS "TH_LIBRARIES: ${TH_LIBRARIES}")
+
+IF(NOT THNN_INSTALL_LIB_SUBDIR)
+ SET(THNN_INSTALL_LIB_SUBDIR "lib" CACHE PATH "THNN install library directory")
+ SET(THNN_INSTALL_INCLUDE_SUBDIR "include" CACHE PATH "THNN install include subdirectory")
+ENDIF()
+
+# Flags
+# When using MSVC
+IF(MSVC)
+  # we want to respect the standard and silence MSVC's deprecation warnings.
+ ADD_DEFINITIONS(-D_CRT_SECURE_NO_DEPRECATE=1)
+ ADD_DEFINITIONS(-DTH_EXPORTS)
+ENDIF(MSVC)
+
+IF (CMAKE_VERSION VERSION_LESS "3.1")
+ SET(CMAKE_C_FLAGS "-std=c99 ${CMAKE_C_FLAGS}")
+ELSE ()
+ SET(CMAKE_C_STANDARD 99)
+ENDIF ()
+
+IF (WITH_OPENMP)
+ FIND_PACKAGE(OpenMP)
+ IF(OPENMP_FOUND)
+ MESSAGE(STATUS "Compiling with OpenMP support")
+ SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+ SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+ SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
+ ENDIF(OPENMP_FOUND)
+ENDIF (WITH_OPENMP)
+
+SET(src init.c)
+ADD_LIBRARY(THNN SHARED ${src})
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+### Torch packages assume the library prefix is "lib"
+SET_TARGET_PROPERTIES(THNN PROPERTIES
+ PREFIX "lib"
+ IMPORT_PREFIX "lib")
+
+TARGET_LINK_LIBRARIES(THNN ${TH_LIBRARIES})
+INSTALL(TARGETS THNN DESTINATION ${RSPAMD_LIBDIR})
diff --git a/contrib/lua-torch/nn/lib/THNN/README.md b/contrib/lua-torch/nn/lib/THNN/README.md
new file mode 100644
index 000000000..e6c61601d
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/README.md
@@ -0,0 +1,32 @@
+# THNN
+
+THNN is a library that gathers nn's C implementations of neural network modules. It is entirely free of Lua dependencies and can therefore be used in any application that has a C FFI. Note that it only contains quite low-level functions; an object-oriented C/C++ wrapper will be created soon as a separate library.
+
+There is also a CUDA counterpart of THNN (THCUNN) in the [cunn repository](https://github.com/torch/cunn/tree/master/lib/THCUNN).
+
+## Links
+
+* [API reference](doc/api_reference.md)
+* [Style guidelines](doc/style_guidelines.md)
+
+## Motivation
+
+Torch's neural network package (nn) provided many optimized C implementations of modules, but the source files contained Lua-specific code and headers, so they couldn't easily be compiled and included anywhere else.
+
+THNN is based on the same code, but is written in pure C, so it can be easily included in other code. **Future C implementations should be committed to THNN.**
+
+## API
+
+THNN is a purely functional library. It provides 2-3 functions for each module that perform the most important operations:
+
+* **updateOutput** - applies the module to an input
+* **updateGradInput** - accepts gradient w.r.t. output and previous module input, and computes a gradient w.r.t. that input
+* **accGradParameters** - *(optional, only modules with parameters)* accepts gradient w.r.t. output and previous module input, and computes gradient w.r.t. the parameters
+
+For information on argument types please check the [API reference](doc/api_reference.md).
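+
+As a sketch of the calling convention (the concrete symbol is generated by the `THNN_(...)` macro in `THNN.h`; the tiny program below is illustrative, not part of the library):
+
+```c
+#include "THNN.h"
+
+int main(void) {
+  THFloatTensor *in  = THFloatTensor_newWithSize1d(4);
+  THFloatTensor *out = THFloatTensor_new();
+  THFloatTensor_fill(in, -2.0f);
+  /* state is unused by the CPU implementations, so NULL is passed */
+  THNN_FloatAbs_updateOutput(NULL, in, out); /* out[i] = |in[i]| = 2 */
+  THFloatTensor_free(in);
+  THFloatTensor_free(out);
+  return 0;
+}
+```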
+
+## Developer docs
+
+* [Style guidelines](doc/style_guidelines.md)
+
+This section will be expanded when the FFI refactoring is finished.
diff --git a/contrib/lua-torch/nn/lib/THNN/THNN.h b/contrib/lua-torch/nn/lib/THNN/THNN.h
new file mode 100644
index 000000000..0019b7976
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/THNN.h
@@ -0,0 +1,33 @@
+#ifndef THNN_H
+#define THNN_H
+
+#include <stdbool.h>
+#include <TH.h>
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#define THNN_(NAME) TH_CONCAT_3(THNN_, Real, NAME)
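+/* e.g. with Real defined as Float (via THGenerateFloatTypes.h below),
+   THNN_(Abs_updateOutput) expands to THNN_FloatAbs_updateOutput */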
+
+#define THIndexTensor THLongTensor
+#define THIndexTensor_(NAME) THLongTensor_ ## NAME
+
+#define THIntegerTensor THIntTensor
+#define THIntegerTensor_(NAME) THIntTensor_ ## NAME
+
+typedef long THIndex_t;
+typedef int THInteger_t;
+typedef void THNNState;
+
+#define THNN_resizeAs_indices(I1, I2) \
+ THLongStorage *size2 = THIndexTensor_(newSizeOf)(I2); \
+ if (!THTensor_(isSize)(I1, size2)) \
+ { \
+ THTensor_(resize)(I1, size2, NULL); \
+ } \
+ THLongStorage_free(size2);
+
+#include "generic/THNN.h"
+#include <THGenerateFloatTypes.h>
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/Abs.c b/contrib/lua-torch/nn/lib/THNN/generic/Abs.c
new file mode 100644
index 000000000..28721ec8e
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/Abs.c
@@ -0,0 +1,28 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/Abs.c"
+#else
+
+void THNN_(Abs_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output)
+{
+ THTensor_(resizeAs)(output, input);
+ THTensor_(abs)(output, input);
+}
+
+void THNN_(Abs_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput)
+{
+ THNN_CHECK_NELEMENT(input, gradOutput);
+ THTensor_(resizeAs)(gradInput, input);
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
+ real z = *input_data;
+ *gradInput_data = *gradOutput_data * (z >= 0 ? 1 : -1);
+ );
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/AbsCriterion.c b/contrib/lua-torch/nn/lib/THNN/generic/AbsCriterion.c
new file mode 100644
index 000000000..9bee5de9e
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/AbsCriterion.c
@@ -0,0 +1,40 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/AbsCriterion.c"
+#else
+
+void THNN_(AbsCriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *output,
+ bool sizeAverage)
+{
+ real sum = 0;
+ THNN_CHECK_NELEMENT(input, target);
+ TH_TENSOR_APPLY2(real, input, real, target,
+ sum += fabs(*input_data - *target_data);
+ );
+
+ if (sizeAverage)
+ sum /= THTensor_(nElement)(input);
+
+ THTensor_(set1d)(output, 0, sum);
+}
+
+void THNN_(AbsCriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage)
+{
+ THNN_CHECK_NELEMENT(input, target);
+ real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.);
+
+ THTensor_(resizeAs)(gradInput, input);
+ TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
+ *gradInput_data = (*input_data - *target_data) >= 0 ? norm : -norm;
+ );
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/BCECriterion.c b/contrib/lua-torch/nn/lib/THNN/generic/BCECriterion.c
new file mode 100644
index 000000000..637a4067e
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/BCECriterion.c
@@ -0,0 +1,66 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/BCECriterion.c"
+#else
+
+#define EPS 1e-12
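+/* EPS keeps the logarithms finite when the input saturates at exactly 0 or
+   1; the loss below is -[y*log(x + EPS) + (1 - y)*log(1 - x + EPS)] */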
+
+void THNN_(BCECriterion_updateOutput)(THNNState *state, THTensor *input,
+ THTensor *target, THTensor *output,
+ bool sizeAverage, THTensor *weights)
+{
+ THNN_CHECK_NELEMENT(input, target);
+ THNN_CHECK_NELEMENT(input, weights);
+ THNN_CHECK_DIM_SIZE(output, 1, 0, 1);
+ real sum = 0;
+
+ if(weights)
+ TH_TENSOR_APPLY3(real, input, real, target, real, weights,
+ real x = *input_data;
+ real y = *target_data;
+ real w = *weights_data;
+      THAssertMsg(x >= 0. && x <= 1.,
+                  "input value should be between 0 and 1, but got %f",
+                  (double) x);
+ sum -= (log(x + EPS) * y + log(1. - x + EPS) * (1. - y)) * w;
+ )
+ else
+ TH_TENSOR_APPLY2(real, input, real, target,
+ real x = *input_data;
+ real y = *target_data;
+      THAssertMsg(x >= 0. && x <= 1.,
+                  "input value should be between 0 and 1, but got %f",
+                  (double) x);
+ sum -= log(x + EPS) * y + log(1. - x + EPS) * (1. - y);
+ );
+
+
+ if (sizeAverage)
+ sum /= THTensor_(nElement)(input);
+
+ THTensor_(set1d)(output, 0, sum);
+}
+
+void THNN_(BCECriterion_updateGradInput)(THNNState *state, THTensor *input,
+ THTensor *target, THTensor *gradInput,
+ bool sizeAverage, THTensor *weights)
+{
+ THNN_CHECK_NELEMENT(input, target);
+ THNN_CHECK_NELEMENT(input, weights);
+
+ real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.);
+
+ THTensor_(resizeAs)(gradInput, input);
+
+ TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
+ real x = *input_data;
+ real y = *target_data;
+ *gradInput_data = - norm * (y - x) / ((1. - x + EPS) * (x + EPS));
+ );
+
+ if(weights)
+ THTensor_(cmul)(gradInput, gradInput, weights);
+}
+
+#undef EPS
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/BatchNormalization.c b/contrib/lua-torch/nn/lib/THNN/generic/BatchNormalization.c
new file mode 100644
index 000000000..b8f462790
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/BatchNormalization.c
@@ -0,0 +1,149 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/BatchNormalization.c"
+#else
+
+void THNN_(BatchNormalization_updateOutput)(
+ THNNState *state, THTensor *input, THTensor *output,
+ THTensor *weight, THTensor *bias,
+ THTensor *running_mean, THTensor *running_var,
+ THTensor *save_mean, THTensor *save_std,
+ bool train, double momentum, double eps)
+{
+ THTensor_(resizeAs)(output, input);
+ long nInput = THTensor_(size)(input, 1);
+ long f;
+ ptrdiff_t n = THTensor_(nElement)(input) / nInput;
+
+ #pragma omp parallel for
+ for (f = 0; f < nInput; ++f) {
+ THTensor *in = THTensor_(newSelect)(input, 1, f);
+ THTensor *out = THTensor_(newSelect)(output, 1, f);
+
+ real mean, invstd;
+
+ if (train) {
+ // compute mean per input
+ accreal sum = 0;
+ TH_TENSOR_APPLY(real, in, sum += *in_data;);
+
+ mean = (real) sum / n;
+ THTensor_(set1d)(save_mean, f, (real) mean);
+
+ // compute variance per input
+ sum = 0;
+ TH_TENSOR_APPLY(real, in,
+ sum += (*in_data - mean) * (*in_data - mean););
+
+ if (sum == 0 && eps == 0.0) {
+ invstd = 0;
+ } else {
+ invstd = (real) (1 / sqrt(sum/n + eps));
+ }
+ THTensor_(set1d)(save_std, f, (real) invstd);
+
+ // update running averages
+ THTensor_(set1d)(running_mean, f,
+ (real) (momentum * mean + (1 - momentum) * THTensor_(get1d)(running_mean, f)));
+
+ accreal unbiased_var = sum / (n - 1);
+ THTensor_(set1d)(running_var, f,
+ (real) (momentum * unbiased_var + (1 - momentum) * THTensor_(get1d)(running_var, f)));
+ } else {
+ mean = THTensor_(get1d)(running_mean, f);
+ invstd = 1 / sqrt(THTensor_(get1d)(running_var, f) + eps);
+ }
+
+ // compute output
+ real w = weight ? THTensor_(get1d)(weight, f) : 1;
+ real b = bias ? THTensor_(get1d)(bias, f) : 0;
+
+ TH_TENSOR_APPLY2(real, in, real, out,
+ *out_data = (real) (((*in_data - mean) * invstd) * w + b););
+
+ THTensor_(free)(out);
+ THTensor_(free)(in);
+ }
+}
+
+void THNN_(BatchNormalization_backward)(
+ THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput,
+ THTensor *gradWeight, THTensor *gradBias, THTensor *weight,
+ THTensor *running_mean, THTensor *running_var,
+ THTensor *save_mean, THTensor *save_std,
+ bool train, double scale, double eps)
+{
+ THNN_CHECK_SHAPE(input, gradOutput);
+ long nInput = THTensor_(size)(input, 1);
+ long f;
+ ptrdiff_t n = THTensor_(nElement)(input) / nInput;
+
+ #pragma omp parallel for
+ for (f = 0; f < nInput; ++f) {
+ THTensor *in = THTensor_(newSelect)(input, 1, f);
+ THTensor *gradOut = THTensor_(newSelect)(gradOutput, 1, f);
+ real w = weight ? THTensor_(get1d)(weight, f) : 1;
+ real mean, invstd;
+ if (train) {
+ mean = THTensor_(get1d)(save_mean, f);
+ invstd = THTensor_(get1d)(save_std, f);
+ } else {
+ mean = THTensor_(get1d)(running_mean, f);
+ invstd = 1 / sqrt(THTensor_(get1d)(running_var, f) + eps);
+ }
+
+ // sum over all gradOutput in feature plane
+ accreal sum = 0;
+ TH_TENSOR_APPLY(real, gradOut, sum += *gradOut_data;);
+
+    // dot product of Q(X) and gradOutput
+ accreal dotp = 0;
+ TH_TENSOR_APPLY2(real, in, real, gradOut,
+ dotp += (*in_data - mean) * (*gradOut_data););
+
+ if (gradInput) {
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor *gradIn = THTensor_(newSelect)(gradInput, 1, f);
+
+ if (train) {
+ // when in training mode
+ // Q(X) = X - E[x] ; i.e. input centered to zero mean
+ // Y = Q(X) / σ ; i.e. BN output before weight and bias
+ // dL/dX = (Q(dL/dY) - dot(Y, dL/dY) * Y) / σ * w
+
+ // projection of gradOutput on to output scaled by std
+ real k = (real) dotp * invstd * invstd / n;
+ TH_TENSOR_APPLY2(real, gradIn, real, in,
+ *gradIn_data = (*in_data - mean) * k;);
+
+ accreal gradMean = sum / n;
+ TH_TENSOR_APPLY2(real, gradIn, real, gradOut,
+ *gradIn_data = (*gradOut_data - gradMean - *gradIn_data) * invstd * w;);
+
+ } else {
+ // when in evaluation mode
+ // Q(X) = X - running_mean ; i.e. input centered to zero mean
+ // Y = Q(X) / running_std ; i.e. BN output before weight and bias
+        // dL/dX = dL/dY * w / running_std
+ TH_TENSOR_APPLY2(real, gradIn, real, gradOut,
+ *gradIn_data = *gradOut_data * invstd * w;);
+ }
+
+ THTensor_(free)(gradIn);
+ }
+
+ if (gradWeight) {
+ real val = THTensor_(get1d)(gradWeight, f);
+ THTensor_(set1d)(gradWeight, f, val + scale * dotp * invstd);
+ }
+
+ if (gradBias) {
+ real val = THTensor_(get1d)(gradBias, f);
+ THTensor_(set1d)(gradBias, f, val + scale * sum);
+ }
+
+ THTensor_(free)(gradOut);
+ THTensor_(free)(in);
+ }
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/ClassNLLCriterion.c b/contrib/lua-torch/nn/lib/THNN/generic/ClassNLLCriterion.c
new file mode 100644
index 000000000..4cf37aeaf
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/ClassNLLCriterion.c
@@ -0,0 +1,163 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/ClassNLLCriterion.c"
+#else
+
+void THNN_(ClassNLLCriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THIndexTensor *target,
+ THTensor *output,
+ bool sizeAverage,
+ THTensor *weights,
+ THTensor *total_weight,
+ long ignore_index)
+{
+ THNN_CHECK_DIM_SIZE(output, 1, 0, 1);
+ THNN_CHECK_DIM_SIZE(total_weight, 1, 0, 1);
+ int n_dims = THTensor_(nDimension)(input);
+ int n_classes = THTensor_(size)(input, n_dims - 1);
+ ignore_index -= TH_INDEX_BASE;
+
+ if (THIndexTensor_(nDimension)(target) > 1) {
+ THError("multi-target not supported");
+ }
+ if (THTensor_(nDimension)(input) > 2) {
+ THError("input tensor should be 1D or 2D");
+ }
+ if (weights && THTensor_(nElement)(weights) != n_classes) {
+ THDescBuff s1 = THTensor_(sizeDesc)(weights);
+ THError("weight tensor should be defined either for all %d classes or no classes"
+ " but got weight tensor of shape: %s", n_classes, s1.str);
+ }
+
+ input = THTensor_(newContiguous)(input);
+ target = THIndexTensor_(newContiguous)(target);
+ weights = weights ? THTensor_(newContiguous)(weights) : NULL;
+
+ real *input_data = THTensor_(data)(input);
+ THIndex_t *target_data = THIndexTensor_(data)(target);
+ real *weights_data = weights ? THTensor_(data)(weights) : NULL;
+ real *output_data = THTensor_(data)(output);
+ real *total_weight_data = THTensor_(data)(total_weight);
+
+ output_data[0] = total_weight_data[0] = 0.0;
+
+ if (THTensor_(nDimension)(input) == 1) {
+ int cur_target = target_data[0] - TH_INDEX_BASE;
+ if (cur_target != ignore_index) {
+ THAssert(cur_target >= 0 && cur_target < n_classes);
+ total_weight_data[0] = weights ? weights_data[cur_target] : 1.0f;
+ output_data[0] = -input_data[cur_target] * total_weight_data[0];
+ }
+ } else if (THTensor_(nDimension)(input) == 2) {
+ int batch_size = THTensor_(size)(input, 0);
+ THAssert(THIndexTensor_(size)(target, 0) == batch_size);
+
+ int n_target = THTensor_(size)(input, 1);
+
+ int i;
+ for (i = 0; i < batch_size; i++) {
+ int cur_target = target_data[i] - TH_INDEX_BASE;
+ if (cur_target != ignore_index) {
+ THAssert(cur_target >= 0 && cur_target < n_classes);
+
+ real cur_weight = weights ? weights_data[cur_target] : 1.0f;
+ total_weight_data[0] += cur_weight;
+ output_data[0] -= input_data[i * n_target + cur_target] * cur_weight;
+ }
+ }
+ }
+
+ if (sizeAverage && total_weight_data[0]) {
+ output_data[0] /= total_weight_data[0];
+ }
+
+ if (weights) {
+ THTensor_(free)(weights);
+ }
+ THTensor_(free)(input);
+ THIndexTensor_(free)(target);
+}
+
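+/* Gradient note: loss = -sum_i w_{t_i} * input_i[t_i] (divided by the total
+   weight when sizeAverage is set), so only the selected log-probability
+   entries receive a nonzero gradient, -w_{t_i}; the function below fills
+   exactly those entries. */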
+void THNN_(ClassNLLCriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THIndexTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage,
+ THTensor *weights,
+ THTensor *total_weight,
+ long ignore_index)
+{
+ int n_dims = THTensor_(nDimension)(input);
+ int n_classes = THTensor_(size)(input, n_dims - 1);
+ ignore_index -= TH_INDEX_BASE;
+
+ if (!THTensor_(isContiguous)(gradInput)) {
+ THError("gradInput must be contiguous");
+ }
+
+ real *total_weight_data = THTensor_(data)(total_weight);
+
+ if (!(*total_weight_data > 0)) {
+ return;
+ }
+
+ if (THIndexTensor_(nDimension)(target) > 1) {
+ THError("multi-target not supported");
+ }
+
+ if (THTensor_(nDimension)(input) > 2) {
+ THError("input tensor should be 1D or 2D");
+ }
+
+ if (weights && THTensor_(nElement)(weights) != n_classes) {
+ THError("weight tensor should be defined either for all or no classes");
+ }
+
+ target = THIndexTensor_(newContiguous)(target);
+ weights = weights ? THTensor_(newContiguous)(weights) : NULL;
+
+ THIndex_t *target_data = THIndexTensor_(data)(target);
+ real *weights_data = weights ? THTensor_(data)(weights) : NULL;
+ real *gradInput_data = THTensor_(data)(gradInput);
+
+ if (THTensor_(nDimension)(input) == 1) {
+ int cur_target = target_data[0] - TH_INDEX_BASE;
+ if (cur_target != ignore_index) {
+ THAssert(cur_target >= 0 && cur_target < n_classes);
+
+ gradInput_data[cur_target] =
+ (!sizeAverage && weights) ? -weights_data[cur_target] : -1;
+ }
+
+ } else if (THTensor_(nDimension)(input) == 2) {
+ int batch_size = THTensor_(size)(input, 0);
+ THAssert(THIndexTensor_(size)(target, 0) == batch_size);
+
+ int n_target = THTensor_(size)(input, 1);
+
+ int i;
+ for (i = 0; i < batch_size; i++){
+ int cur_target = target_data[i] - TH_INDEX_BASE;
+
+ if (cur_target != ignore_index) {
+ THAssert(cur_target >= 0 && cur_target < n_classes);
+
+ gradInput_data[i * n_target + cur_target] =
+ -(weights ? weights_data[cur_target] : 1.0f);
+
+ if (sizeAverage && *total_weight_data) {
+ gradInput_data[i * n_target + cur_target] /= *total_weight_data;
+ }
+ }
+ }
+ }
+
+ THIndexTensor_(free)(target);
+ if (weights) {
+ THTensor_(free)(weights);
+ }
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/DistKLDivCriterion.c b/contrib/lua-torch/nn/lib/THNN/generic/DistKLDivCriterion.c
new file mode 100644
index 000000000..6bd6aa067
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/DistKLDivCriterion.c
@@ -0,0 +1,44 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/DistKLDivCriterion.c"
+#else
+
+void THNN_(DistKLDivCriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *output,
+ bool sizeAverage)
+{
+ THNN_CHECK_NELEMENT(input, target);
+ THNN_CHECK_DIM_SIZE(output, 1, 0, 1);
+
+ real sum = 0;
+
+ TH_TENSOR_APPLY2(real, input, real, target,
+ sum += *target_data > 0 ? *target_data * (log(*target_data) - *input_data) : 0;
+ );
+
+ if (sizeAverage)
+ sum /= THTensor_(nElement)(input);
+
+ THTensor_(set1d)(output, 0, sum);
+}
+
+void THNN_(DistKLDivCriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage)
+{
+ THNN_CHECK_NELEMENT(input, target);
+
+ real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.);
+
+ THTensor_(resizeAs)(gradInput, input);
+ TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
+ *gradInput_data = *target_data > 0 ? norm * (-*target_data) : 0;
+ );
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/ELU.c b/contrib/lua-torch/nn/lib/THNN/generic/ELU.c
new file mode 100644
index 000000000..ddcfb9705
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/ELU.c
@@ -0,0 +1,54 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/ELU.c"
+#else
+
+void THNN_(ELU_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ accreal alpha_,
+ bool inplace)
+{
+ real alpha = TH_CONVERT_ACCREAL_TO_REAL(alpha_);
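+ /* ELU: f(x) = x for x > 0 and alpha * (exp(x) - 1) for x <= 0. */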
+ if(inplace) {
+ TH_TENSOR_APPLY(real, input,
+ if(*input_data <= 0) {
+ *input_data = (exp(*input_data) - 1) * alpha;
+ }
+ );
+ THTensor_(set)(output, input);
+ } else {
+ THTensor_(resizeAs)(output, input);
+ TH_TENSOR_APPLY2(real, input, real, output,
+ *output_data = *input_data <= 0 ? (exp(*input_data)-1)*alpha : *input_data;
+ );
+ }
+}
+
+void THNN_(ELU_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *output,
+ accreal alpha_,
+ bool inplace)
+{
+ real alpha = TH_CONVERT_ACCREAL_TO_REAL(alpha_);
+ THNN_CHECK_NELEMENT(input, gradOutput);
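+ /* For x <= 0, f(x) = alpha * (exp(x) - 1), so f'(x) = alpha * exp(x)
+ * = f(x) + alpha. Reusing the saved output avoids recomputing exp(). */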
+ if(inplace) {
+ TH_TENSOR_APPLY2(real, gradOutput, real, output,
+ if(*output_data <= 0) {
+ *gradOutput_data *= *output_data + alpha;
+ }
+ );
+ THTensor_(set)(gradInput, gradOutput);
+ } else {
+ THTensor_(resizeAs)(gradInput, output);
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output,
+ *gradInput_data = *output_data <= 0 ? *gradOutput_data * (*output_data + alpha) : *gradOutput_data;
+ );
+ }
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/FusedRNNKernel.c b/contrib/lua-torch/nn/lib/THNN/generic/FusedRNNKernel.c
new file mode 100644
index 000000000..30788b0a2
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/FusedRNNKernel.c
@@ -0,0 +1,55 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/FusedRNNKernel.c"
+#else
+
+void THNN_(GRUFused_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *hidden,
+ THTensor *bias1,
+ THTensor *bias2,
+ THTensor *hx,
+ THTensor *hy,
+ THTensor *storage)
+{
+ THAssertMsg(false, "Not implemented for CPU");
+}
+
+void THNN_(GRUFused_updateGradInput)(
+ THNNState *state,
+ THTensor *gradInInput,
+ THTensor *gradInHidden,
+ THTensor *gradOutput,
+ THTensor *gradInputHx,
+ THTensor *storage)
+{
+ THAssertMsg(false, "Not implemented for CPU");
+}
+
+void THNN_(LSTMFused_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *hidden,
+ THTensor *bias1,
+ THTensor *bias2,
+ THTensor *cx,
+ THTensor *hy,
+ THTensor *cy)
+{
+ THAssertMsg(false, "Not implemented for CPU");
+}
+
+void THNN_(LSTMFused_updateGradInput)(
+ THNNState *state,
+ THTensor *storage,
+ THTensor *gradInGates,
+ THTensor *prevC,
+ THTensor *cy,
+ THTensor *gradOutput,
+ THTensor *gradOutputCell,
+ THTensor *gradInputCx)
+{
+ THAssertMsg(false, "Not implemented for CPU");
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/GatedLinearUnit.c b/contrib/lua-torch/nn/lib/THNN/generic/GatedLinearUnit.c
new file mode 100644
index 000000000..274a27e3b
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/GatedLinearUnit.c
@@ -0,0 +1,73 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/GatedLinearUnit.c"
+#else
+
+void THNN_(GatedLinear_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int dim)
+{
+ // size output to half of input
+ dim = dim - TH_INDEX_BASE;
+ const long nIn = THTensor_(size)(input, dim);
+ THArgCheck(nIn % 2 == 0, 2, "Halving dimension must be even. Dim %d is size %ld",
+ dim + TH_INDEX_BASE, nIn);
+
+ const long inputSize = THTensor_(size)(input, dim) / 2;
+ THLongStorage *newSizes = THTensor_(newSizeOf)(input);
+ THLongStorage_set(newSizes, dim, inputSize);
+ THTensor_(resize)(output, newSizes, NULL);
+
+ // halve tensor
+ THTensor *firstHalf = THTensor_(newNarrow)(input, dim, 0, inputSize);
+ THTensor *secondHalf = THTensor_(newNarrow)(input, dim, inputSize, inputSize);
+
+ // x = x1:cmul( sigmoid(x2) )
+ THTensor_(sigmoid)(output, secondHalf);
+ THTensor_(cmul)(output, output, firstHalf);
+
+ THLongStorage_free(newSizes);
+ THTensor_(free)(firstHalf);
+ THTensor_(free)(secondHalf);
+}
+
+void THNN_(GatedLinear_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int dim)
+{
+ // set up tensors
+ dim = dim - TH_INDEX_BASE;
+ const long nIn = THTensor_(size)(input, dim);
+ THArgCheck(nIn % 2 == 0, 2, "Halving dimension must be even. Dim %d is size %ld",
+ dim + TH_INDEX_BASE, nIn);
+
+ THTensor_(resizeAs)(gradInput, input);
+ const long inputSize = THTensor_(size)(input, dim) / 2;
+ THTensor *firstHalf = THTensor_(newNarrow)(input, dim, 0, inputSize);
+ THTensor *secondHalf = THTensor_(newNarrow)(input, dim, inputSize, inputSize);
+ THTensor *gradInputfirstHalf = THTensor_(newNarrow)(gradInput, dim, 0, inputSize);
+ THTensor *gradInputsecondHalf = THTensor_(newNarrow)(gradInput, dim, inputSize, inputSize);
+
+ THTensor_(sigmoid)(gradInputfirstHalf, secondHalf);
+
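+ /* With o = x1 * s and s = sigmoid(x2): dL/dx1 = gradOutput * s and
+ * dL/dx2 = gradOutput * x1 * s * (1 - s). gradInputfirstHalf now holds s,
+ * so the loop below turns gradInputsecondHalf into s * (1 - s). */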
+ TH_TENSOR_APPLY2(real, gradInputsecondHalf, real, gradInputfirstHalf,
+ real z = *gradInputfirstHalf_data;
+ *gradInputsecondHalf_data = (1. - z) * z;
+ );
+
+ THTensor_(cmul)(gradInputfirstHalf, gradInputfirstHalf, gradOutput);
+
+ THTensor_(cmul)(gradInputsecondHalf, gradInputsecondHalf, gradOutput);
+ THTensor_(cmul)(gradInputsecondHalf, gradInputsecondHalf, firstHalf);
+
+ THTensor_(free)(firstHalf);
+ THTensor_(free)(secondHalf);
+ THTensor_(free)(gradInputfirstHalf);
+ THTensor_(free)(gradInputsecondHalf);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/HardShrink.c b/contrib/lua-torch/nn/lib/THNN/generic/HardShrink.c
new file mode 100644
index 000000000..aaae85bac
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/HardShrink.c
@@ -0,0 +1,42 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/HardShrink.c"
+#else
+
+void THNN_(HardShrink_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ accreal lambda_)
+{
+ real lambda = TH_CONVERT_ACCREAL_TO_REAL(lambda_);
+ THTensor_(resizeAs)(output, input);
+
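+ /* HardShrink: f(x) = x when |x| > lambda, 0 otherwise. */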
+ TH_TENSOR_APPLY2(real, output, real, input,
+ if (*input_data > lambda)
+ *output_data = *input_data;
+ else if (*input_data < -lambda)
+ *output_data = *input_data;
+ else
+ *output_data = 0;
+ );
+}
+
+void THNN_(HardShrink_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ accreal lambda_)
+{
+ real lambda = TH_CONVERT_ACCREAL_TO_REAL(lambda_);
+ THNN_CHECK_NELEMENT(input, gradOutput);
+ THTensor_(resizeAs)(gradInput, input);
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
+ if (*input_data > lambda || *input_data < -lambda)
+ *gradInput_data = *gradOutput_data;
+ else
+ *gradInput_data = 0;
+ );
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/HardTanh.c b/contrib/lua-torch/nn/lib/THNN/generic/HardTanh.c
new file mode 100644
index 000000000..589a66e15
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/HardTanh.c
@@ -0,0 +1,133 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/HardTanh.c"
+#else
+
+void THNN_(HardTanh_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ accreal min_val_,
+ accreal max_val_,
+ bool inplace)
+{
+ real min_val = TH_CONVERT_ACCREAL_TO_REAL(min_val_);
+ real max_val = TH_CONVERT_ACCREAL_TO_REAL(max_val_);
+ if (inplace)
+ THTensor_(set)(output, input);
+ else
+ THTensor_(resizeAs)(output, input);
+
+ if (input->nDimension == 1 || !THTensor_(isContiguous)(input) || !THTensor_(isContiguous)(output))
+ {
+ if (inplace)
+ TH_TENSOR_APPLY(real, input,
+ if (*input_data < min_val)
+ *input_data = min_val;
+ else if (*input_data > max_val)
+ *input_data = max_val;
+ );
+ else
+ TH_TENSOR_APPLY2(real, output, real, input,
+ if (*input_data < min_val)
+ *output_data = min_val;
+ else if (*input_data <= max_val)
+ *output_data = *input_data;
+ else
+ *output_data = max_val;
+ );
+ }
+ else
+ {
+ real* ptr_input = THTensor_(data)(input);
+ real* ptr_output = THTensor_(data)(output);
+ ptrdiff_t i;
+ ptrdiff_t n = THTensor_(nElement)(input);
+
+ if (inplace)
+#pragma omp parallel for private(i)
+ for (i = 0; i < n; i++)
+ {
+ if (ptr_input[i] < min_val)
+ ptr_input[i] = min_val;
+ else if (ptr_input[i] > max_val)
+ ptr_input[i] = max_val;
+ }
+ else
+#pragma omp parallel for private(i)
+ for (i = 0; i < n; i++)
+ {
+ if (ptr_input[i] < min_val)
+ ptr_output[i] = min_val;
+ else if (ptr_input[i] <= max_val)
+ ptr_output[i] = ptr_input[i];
+ else
+ ptr_output[i] = max_val;
+ }
+ }
+}
+
+void THNN_(HardTanh_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ accreal min_val_,
+ accreal max_val_,
+ bool inplace)
+{
+ real min_val = TH_CONVERT_ACCREAL_TO_REAL(min_val_);
+ real max_val = TH_CONVERT_ACCREAL_TO_REAL(max_val_);
+
+ THNN_CHECK_NELEMENT(input, gradOutput);
+ if (inplace)
+ THTensor_(set)(gradInput, gradOutput);
+ else
+ THTensor_(resizeAs)(gradInput, input);
+
+ if (input->nDimension == 1 ||
+ !THTensor_(isContiguous)(input) ||
+ !THTensor_(isContiguous)(gradOutput) ||
+ !THTensor_(isContiguous)(gradInput))
+ {
+ if (inplace)
+ {
+ TH_TENSOR_APPLY2(real, gradOutput, real, input,
+ if (*input_data <= min_val || *input_data >= max_val)
+ *gradOutput_data = 0;
+ );
+ }
+ else
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
+ if (*input_data <= min_val || *input_data >= max_val)
+ *gradInput_data = 0;
+ else
+ *gradInput_data = *gradOutput_data;
+ );
+ }
+ else
+ {
+ real* ptr_gradOutput = THTensor_(data)(gradOutput);
+ real* ptr_gradInput = THTensor_(data)(gradInput);
+ real* ptr_input = THTensor_(data)(input);
+ ptrdiff_t i;
+ ptrdiff_t n = THTensor_(nElement)(input);
+
+ if (inplace)
+#pragma omp parallel for private(i)
+ for (i = 0; i < n; i++)
+ {
+ if (ptr_input[i] <= min_val || ptr_input[i] >= max_val)
+ ptr_gradInput[i] = 0;
+ }
+ else
+#pragma omp parallel for private(i)
+ for (i = 0; i < n; i++)
+ {
+ if (ptr_input[i] <= min_val || ptr_input[i] >= max_val)
+ ptr_gradInput[i] = 0;
+ else
+ ptr_gradInput[i] = ptr_gradOutput[i];
+ }
+ }
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/IndexLinear.c b/contrib/lua-torch/nn/lib/THNN/generic/IndexLinear.c
new file mode 100644
index 000000000..42d8368ba
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/IndexLinear.c
@@ -0,0 +1,742 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/IndexLinear.c"
+#else
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+/* Threshold used to trigger multithreading */
+#ifndef THNN_SPARSE_OMP_THRESHOLD
+#define THNN_SPARSE_OMP_THRESHOLD 100000
+#endif
+
+/* Threshold used to trigger BLAS axpy call */
+#ifndef THNN_SPARSE_OUTDIM_THRESHOLD
+#define THNN_SPARSE_OUTDIM_THRESHOLD 49
+#endif
+
+/* sign MACRO */
+#ifndef THNN_INDEXLINEAR_SIGN
+#define THNN_INDEXLINEAR_SIGN(a) ( ( (a) < 0 ) ? -1 : ( (a) > 0 ) )
+#endif
+
+static bool THNN_(checkKeysValues)(THLongTensor* keys, THTensor* values)
+{
+ return THLongTensor_size(keys, 0) == THTensor_(nElement)(values)
+ && THTensor_(nDimension)(values) == 1
+ && THLongTensor_nDimension(keys) == 1;
+}
+
+void THNN_(IndexLinear_updateOutput)(
+ THNNState *state,
+ THLongTensor *keys,
+ long keysOffset,
+ THTensor *values,
+ THLongTensor *sizes,
+ THLongTensor *cumSumSizes,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *normalizedValues,
+ int train)
+{
+ /* Retrieve all the dimensions of the problem */
+ long batchSize = THLongTensor_size(sizes, 0);
+ long keysSize = THLongTensor_size(keys, 0);
+ long outDim = THTensor_(size)(bias, 0);
+ long woutDim = THTensor_(size)(weight, 1);
+ int maxNormalize = woutDim - outDim;
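+ /* When woutDim > outDim, the first maxNormalize columns of each weight
+ * row hold per-key normalization state; from the offsets used below:
+ * [0] running max |value|, [1] its reciprocal, [2] an update-scale slot,
+ * [3] an additive offset. The outDim real weights start at
+ * woffset + maxNormalize. */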
+ long* sizesData = THLongTensor_data(sizes);
+ long* cumSumSizesData = THLongTensor_data(cumSumSizes);
+
+ /* Define/resize the normalized values tensor if maxNormalize is > 0 */
+ real* normalizedValuesData = NULL;
+ if (maxNormalize)
+ {
+ THTensor_(resize1d)(normalizedValues, keysSize);
+ normalizedValuesData = THTensor_(data)(normalizedValues);
+ }
+
+ /* Resize the output */
+ THTensor_(resize2d)(output, batchSize, outDim);
+
+ /* Access the storage data/strides */
+ real* outputData = THTensor_(data)(output);
+ real* valuesData = THTensor_(data)(values);
+ real* weightData = THTensor_(data)(weight);
+ long weightStride0 = weight->stride[0];
+ real* biasData = THTensor_(data)(bias);
+ long* keysData = THLongTensor_data(keys);
+
+ /* Make sure these inputs are contiguous to accelerate computations */
+ THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(output), 6, "output vector must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(weight), 7, "weight matrix must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(bias), 8, "bias vector must be contiguous");
+ THArgCheck(THNN_(checkKeysValues)(keys, values), 1, "Keys and values should have the same number of elements");
+ THArgCheck(THTensor_(isContiguous)(normalizedValues), 9, "normalizedValues vector must be contiguous");
+ long i,j,k;
+
+ /* Separate cases: output dimension is == 1, or > 1
+ * This allows for some optimizations. */
+ if (outDim == 1)
+ {
+ THVector_(fill)(outputData, *biasData, batchSize);
+ if (maxNormalize)
+ {
+ /* Parallelize on the batch itself */
+#pragma omp parallel \
+ for private(i,j) \
+ firstprivate(outDim, keysOffset, \
+ weightData, keysData, \
+ valuesData, outputData, \
+ cumSumSizesData, sizesData) \
+ schedule(static) \
+ if(keysSize*outDim > THNN_SPARSE_OMP_THRESHOLD && batchSize > 1)
+ for (j = 0; j < batchSize; j++)
+ {
+ real* loutputData = outputData + j;
+ real val = 0;
+ real absVal = 0;
+ long offset = j == 0 ? 0 : cumSumSizesData[j - 1];
+
+ for (i = 0; i < sizesData[j]; i++)
+ {
+ long woffset = weightStride0*(keysData[offset] + keysOffset);
+ absVal = fabs(valuesData[offset]);
+ if (train)
+ {
+ if (absVal > weightData[woffset])
+ {
+ weightData[woffset] = absVal;
+ weightData[woffset+1] = 1/absVal;
+ }
+
+ /*
+ * The following can be used to scale the size of the updates
+ * depending on some rule, e.g. the frequency of a feature, ...
+ * This is used at update time.
+ * TODO: implement a smarter update scale.
+ */
+ weightData[woffset+2] = 1;
+ }
+ normalizedValuesData[offset] = (absVal > weightData[woffset] ? THNN_INDEXLINEAR_SIGN(valuesData[offset]):valuesData[offset]*weightData[woffset+1]) + weightData[woffset+3];
+ val += normalizedValuesData[offset] * weightData[woffset+maxNormalize];
+ offset++;
+ }
+ *loutputData += val;
+ }
+ }
+ else
+ {
+ /* Parallelize on the batch itself */
+#pragma omp parallel \
+ for private(i,j) \
+ firstprivate(outDim, weightData, \
+ keysData, valuesData, \
+ outputData, cumSumSizesData, \
+ sizesData) \
+ schedule(static) \
+ if(keysSize*outDim > THNN_SPARSE_OMP_THRESHOLD && batchSize > 1)
+ for (j = 0; j < batchSize; j++)
+ {
+ long offset = j == 0 ? 0 : cumSumSizesData[j - 1];
+ real* loutputData = outputData + j;
+ real val = 0;
+
+ for (i = 0; i < sizesData[j]; i++)
+ {
+ val += weightData[weightStride0*(keysData[offset] + keysOffset)] * valuesData[offset];
+ offset++;
+ }
+ *loutputData += val;
+ }
+ }
+ }
+ else {
+#pragma omp parallel \
+ for private(i,j,k) \
+ firstprivate(outDim, weightData, \
+ keysData, valuesData, \
+ biasData, outputData, \
+ cumSumSizesData, sizesData) \
+ schedule(static) \
+ if(keysSize*outDim > THNN_SPARSE_OMP_THRESHOLD && batchSize > 1)
+ for (j = 0; j < batchSize; j++)
+ {
+ long offset = j == 0 ? 0 : cumSumSizesData[j - 1];
+ real val = 0;
+ real* loutputData = outputData + j*outDim;
+ real* lweightData = weightData;
+ memcpy(loutputData, biasData, outDim*sizeof(real));
+ for (i = 0; i < sizesData[j]; i++)
+ {
+ real val;
+ long woffset = weightStride0*(keysData[offset] + keysOffset);
+ if (maxNormalize)
+ {
+ val = valuesData[offset];
+ real absVal = fabs(val);
+ if (train)
+ {
+ if (absVal > weightData[woffset])
+ {
+ weightData[woffset] = absVal;
+ weightData[woffset+1] = 1/absVal;
+ }
+
+ /*
+ * The following can be used to scale the size of the updates
+ * depending on some rule, e.g. the frequency of a feature, ...
+ * The commented section thereafter is just an example of what can be done:
+ *
+ *```
+ * weightData[woffset+2] = weightData[woffset+2]==0?1:(weightData[woffset+2] / (weightData[woffset+2] + 1));
+ * real alpha = 1;
+ * real beta = 0.01;
+ * real gamma = 1 - 0.000001;
+ * real l = weightData[woffset+2]==0?1/gamma:(weightData[woffset+2] - beta) / (alpha - beta);
+ * l = gamma*l;
+ * weightData[woffset+2] = (alpha-beta)*l + beta;
+ * ```
+ *
+ * TODO: implement a smarter update scale.
+ */
+ weightData[woffset+2] = 1;
+ }
+
+ /* Normalize + Clamp */
+ val = (absVal > weightData[woffset] ? THNN_INDEXLINEAR_SIGN(val):val*weightData[woffset+1]) + weightData[woffset+3];
+ normalizedValuesData[offset] = val;
+
+ lweightData = weightData + woffset + maxNormalize;
+ }
+ else
+ {
+ val = valuesData[offset];
+ lweightData = weightData + woffset;
+ }
+ if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD)
+ {
+ THBlas_(axpy)(outDim, val, lweightData, 1, loutputData, 1);
+ }
+ else
+ {
+ for (k=0; k < outDim; k++)
+ {
+ loutputData[k] += lweightData[k] * val;
+ }
+ }
+ offset++;
+ }
+ }
+ }
+ return;
+}
+
+void THNN_(IndexLinear_updateParameters)(
+ THNNState *state,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *weight,
+ THTensor *bias,
+ THLongTensor *runningKeys,
+ THLongTensor *cumSumSizes,
+ long keysOffset,
+ accreal weightDecay_,
+ accreal learningRate_)
+{
+ real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
+ real learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_);
+ /* Retrieve all the dimensions of the problem */
+ long outDim = THTensor_(size)(bias, 0);
+ long woutDim = THTensor_(size)(weight, 1);
+ int maxNormalize = woutDim - outDim;
+ long keysSize = THLongTensor_size(runningKeys, 0);
+
+ /* Access the storage data/strides */
+ real* gradWeightData = THTensor_(data)(gradWeight);
+ real* weightData = THTensor_(data)(weight);
+ long weightStride0 = weight->stride[0];
+ real* gradBiasData = THTensor_(data)(gradBias);
+ real* biasData = THTensor_(data)(bias);
+ long* keysData = THLongTensor_data(runningKeys);
+
+ /* Make sure these inputs are contiguous to accelerate computations */
+ THArgCheck(THTensor_(isContiguous)(gradWeight), 1, "gradWeight must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(gradBias), 2, "gradBias vector must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(weight), 3, "weight matrix must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(bias), 4, "bias vector must be contiguous");
+ THArgCheck(THLongTensor_isContiguous(runningKeys), 5, "keys vector must be contiguous");
+
+ int j,k;
+ long offset = 0;
+
+ /* Update the bias first */
+ THVector_(cadd)(biasData, biasData, gradBiasData, -learningRate, outDim);
+
+ /* Separate cases: output dimension is == 1, or > 1
+ * This allows for some optimizations.
+ * No multithreading here as this could
+ * corrupt the results (hogwild style) */
+ if (outDim == 1)
+ {
+ if (maxNormalize)
+ {
+ if (weightDecay)
+ {
+ for (j = 0; j < keysSize; j++)
+ {
+ long woffset = weightStride0*(keysData[j] + keysOffset) + maxNormalize;
+ real lr = learningRate*weightData[woffset-2];
+ weightData[woffset-1] -= weightData[woffset]*gradWeightData[2*j]*lr;
+ weightData[woffset] -= gradWeightData[2*j+1]*lr - weightDecay * weightData[woffset-2] * weightData[woffset];
+ }
+ }
+ else
+ {
+ for (j = 0; j < keysSize; j++)
+ {
+ long woffset = weightStride0*(keysData[j] + keysOffset) + maxNormalize;
+ real lr = learningRate*weightData[woffset-2];
+ weightData[woffset-1] -= weightData[woffset]*gradWeightData[2*j]*lr;
+ weightData[woffset] -= gradWeightData[2*j+1]*lr;
+ }
+ }
+ }
+ else
+ {
+ if (weightDecay)
+ {
+ for (j = 0; j < keysSize; j++)
+ {
+ long woffset = weightStride0*(keysData[j] + keysOffset);
+ weightData[woffset] -= gradWeightData[j]*learningRate + weightDecay * weightData[woffset];
+ }
+ }
+ else
+ {
+ for (j = 0; j < keysSize; j++)
+ {
+ weightData[weightStride0*(keysData[j] + keysOffset)] -= gradWeightData[j]*learningRate;
+ }
+ }
+ }
+ }
+ else
+ {
+ for (j = 0; j < keysSize; j++)
+ {
+ real lr = learningRate;
+ real wd = weightDecay;
+ real* lweightData;
+ long woffset = weightStride0*(keysData[j] + keysOffset);
+ real* lgradWeightData = gradWeightData + j*outDim;
+ if (maxNormalize)
+ {
+ lgradWeightData += j*outDim;
+ /* weightData[woffset + 2] */
+ lweightData = weightData + woffset + maxNormalize - 2;
+ lr = lr*lweightData[0];
+ wd = weightDecay*lweightData[0];
+ /* weightData[woffset + 3] */
+ lweightData++;
+ for (k=0; k < outDim; k++)
+ {
+ lweightData[0] -= lgradWeightData[k]*lweightData[k+1]*lr;
+ }
+ lweightData++;
+ lgradWeightData += outDim;
+ }
+ else
+ {
+ lweightData = weightData + woffset;
+ }
+
+ /* We do sparse weight decay.
+ * We think it makes more sense. */
+ if (weightDecay)
+ {
+ for (k=0; k < outDim; k++)
+ {
+ lweightData[k] -= lweightData[k]*wd;
+ }
+ }
+
+ if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD)
+ {
+ THBlas_(axpy)(outDim, -lr, lgradWeightData, 1, lweightData, 1);
+ }
+ else
+ {
+ for (k=0; k < outDim; k++)
+ {
+ lweightData[k] -= lgradWeightData[k]*lr;
+ }
+ }
+ }
+ }
+}
+
+
+void THNN_(IndexLinear_accUpdateGradParameters)(
+ THNNState *state,
+ THLongTensor *keys,
+ long keysOffset,
+ THTensor *values,
+ THLongTensor *sizes,
+ THLongTensor *cumSumSizes,
+ THTensor *gradOutput,
+ THTensor *weight,
+ THTensor *bias,
+ accreal weightDecay_,
+ accreal scale_)
+{
+ real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ /* Retrieve all the dimensions of the problem */
+ long batchSize = THLongTensor_size(sizes, 0);
+ long keysSize = THLongTensor_size(keys, 0);
+ long outDim = THTensor_(size)(bias, 0);
+ long woutDim = THTensor_(size)(weight, 1);
+ int maxNormalize = woutDim - outDim;
+ THArgCheck(THNN_(checkKeysValues)(keys, values), 1, "Keys and values should have the same number of elements");
+
+ /* Access the storage data/strides */
+ real* gradOutputData = THTensor_(data)(gradOutput);
+ real* valuesData = THTensor_(data)(values);
+ real* weightData = THTensor_(data)(weight);
+ real* biasData = THTensor_(data)(bias);
+ long weightStride0 = weight->stride[0];
+ long biasStride = bias->stride[0];
+ long* keysData = THLongTensor_data(keys);
+ long* sizesData = THLongTensor_data(sizes);
+
+ /* Make sure these inputs are contiguous to accelerate computations */
+ THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(gradOutput), 6, "gradOutput vector must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(weight), 7, "weight matrix must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(bias), 8, "bias vector must be contiguous");
+
+ int i,j,k;
+
+ /* Separate cases: output dimension is == 1, or > 1
+ * This allows for some optimizations.
+ * No multithreading here as this could
+ * corrupt the results (hogwild style) */
+ if (outDim == 1)
+ {
+ if (maxNormalize)
+ {
+ long offset = 0;
+ for (j = 0; j < batchSize; j++)
+ {
+ real* lgradOutputData = gradOutputData + j;
+ *biasData -= *lgradOutputData * scale;
+ real val = *lgradOutputData * scale;
+ real* lweightData = weightData;
+ for (i = 0; i < sizesData[j]; i++)
+ {
+ long idx = weightStride0*(keysData[offset] + keysOffset) + maxNormalize;
+ weightData[idx-1] -= weightData[idx]*val*weightData[idx-2];
+ weightData[idx] -= (val*valuesData[offset] - weightDecay * weightData[idx])*weightData[idx-2];
+ offset++;
+ }
+ }
+
+ offset = 0;
+ for (j = 0; j < batchSize; j++)
+ {
+ real* lweightData = weightData;
+ for (i = 0; i < sizesData[j]; i++)
+ {
+ long idx = weightStride0*(keysData[offset] + keysOffset) + maxNormalize;
+ weightData[idx-2] = 0;
+ offset++;
+ }
+ }
+ }
+ else
+ {
+ if (weightDecay)
+ {
+ long offset = 0;
+ for (j = 0; j < batchSize; j++)
+ {
+ real* lgradOutputData = gradOutputData + j;
+ *biasData -= *lgradOutputData * scale;
+ real val = *lgradOutputData * scale;
+ real* lweightData = weightData;
+ for (i = 0; i < sizesData[j]; i++)
+ {
+ long idx = weightStride0*(keysData[offset] + keysOffset);
+ weightData[idx] -= val * valuesData[offset] + weightData[idx] * weightDecay;
+ offset++;
+ }
+ }
+ }
+ else
+ {
+ long offset = 0;
+ for (j = 0; j < batchSize; j++)
+ {
+ real val = gradOutputData[j] * scale;
+ for (i = 0; i < sizesData[j]; i++)
+ {
+ weightData[(keysData[offset] + keysOffset)*weightStride0] -= val * valuesData[offset];
+ offset++;
+ }
+ *biasData -= val;
+ }
+ }
+ }
+ }
+ else {
+ long offset = 0;
+ for (j = 0; j < batchSize; j++)
+ {
+ real val = 0;
+ real* lgradOutputData = gradOutputData + j*outDim;
+ real* lweightData = weightData;
+ THVector_(cadd)(biasData, biasData, lgradOutputData, -scale, outDim);
+ for (i = 0; i < sizesData[j]; i++)
+ {
+ real val = valuesData[offset] * scale;
+ real wd = weightDecay;
+
+ // Max normalize case
+ if (maxNormalize)
+ {
+ lweightData = weightData + weightStride0*(keysData[offset] + keysOffset) + (maxNormalize-2);
+ val *= lweightData[0];
+ wd *= lweightData[0];
+ for (k=0; k < outDim; k++)
+ {
+ lweightData[1] -= lweightData[k+2]*scale*lgradOutputData[k]*lweightData[0];
+ }
+ lweightData += 2;
+ }
+ else
+ {
+ lweightData = weightData + weightStride0*(keysData[offset] + keysOffset);
+ }
+
+ /* We do sparse weight decay.
+ * We think it makes more sense. */
+ if (weightDecay)
+ {
+ if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD)
+ {
+ THBlas_(axpy)(outDim, -wd, lweightData, 1, lweightData, 1);
+ }
+ else
+ {
+ for (k=0; k < outDim; k++)
+ {
+ lweightData[k] -= wd * lweightData[k];
+ }
+ }
+ }
+
+ if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD)
+ {
+ THBlas_(axpy)(outDim, -val, lgradOutputData, 1, lweightData, 1);
+ }
+ else
+ {
+ for (k=0; k < outDim; k++)
+ {
+ lweightData[k] -= val * lgradOutputData[k];
+ }
+ }
+ offset++;
+ }
+ }
+
+ /* Max Normalize case:
+ * Reset the smart update scaling if
+ * one does it batch-wise.
+ * TODO: Decide what to do with this piece of code.
+ * NB: If the code below is uncommented, the commented code
+ * in IndexLinear:zeroGradParameters() should be too. */
+
+ /*
+ if (maxNormalize)
+ {
+ offset = 0;
+ for (j = 0; j < batchSize; j++)
+ {
+ real* lweightData = weightData;
+ for (i = 0; i < sizesData[j]; i++)
+ {
+ real val = valuesData[offset] * scale;
+ real wd = weightDecay;
+
+ lweightData = weightData + weightStride0*(keysData[offset] + keysOffset) + (maxNormalize-2);
+ lweightData[0] = 0;
+ offset++;
+ }
+ }
+ }
+ */
+ }
+ return;
+}
+
+void THNN_(IndexLinear_accGradParameters)(
+ THNNState *state,
+ THLongTensor *keys,
+ long keysOffset,
+ THTensor *values,
+ THLongTensor *sizes,
+ THLongTensor *cumSumSizes,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *valuesBuffer,
+ accreal weightDecay_,
+ accreal scale_)
+{
+ real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ /* Retrieve all the dimensions of the problem */
+ long batchSize = THLongTensor_size(sizes, 0);
+ long keysSize = THLongTensor_size(keys, 0);
+ long outDim = THTensor_(size)(bias, 0);
+ long woutDim = THTensor_(size)(weight, 1);
+ long maxNormalize = (woutDim - outDim) > 0 ? 1 : 0;
+ THArgCheck(THNN_(checkKeysValues)(keys, values), 1, "Keys and values should have the same number of elements");
+ long* sizesData = THLongTensor_data(sizes);
+
+ /* Compute the cumulative sizes */
+ THLongTensor* cumSizes = THLongTensor_new();
+ THLongTensor_cumsum(cumSizes, sizes, 0);
+ long* cumSizesData = THLongTensor_data(cumSizes);
+
+ /* Resize the gradWeight buffer to keep it dense.
+ * That speeds up updates A LOT assuming random mem access. */
+ THTensor_(resize2d)(gradWeight, keysSize, outDim * (maxNormalize>0?2:1));
+
+ /* Access the storage data/strides */
+ real* gradOutputData = THTensor_(data)(gradOutput);
+ real* valuesData = THTensor_(data)(values);
+ real* gradWeightData = THTensor_(data)(gradWeight);
+ real* weightData = THTensor_(data)(weight);
+ real* gradBiasData = THTensor_(data)(gradBias);
+ long gradWeightStride0 = gradWeight->stride[0];
+ long weightStride0 = weight->stride[0];
+ long* keysData = THLongTensor_data(keys);
+
+ /* Make sure these inputs are contiguous to accelerate computations */
+ THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(gradOutput), 6, "gradOutput vector must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(gradWeight), 7, "gradWeight must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(gradBias), 8, "gradBias vector must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(weight), 9, "weight must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(bias), 10, "bias vector must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(valuesBuffer), 11, "valuesBuffer must be contiguous");
+
+ int i,j,k;
+
+ /* Separate cases: output dimension is == 1, or > 1
+ * This allows for some optimizations.
+ * No multithreading here as this could
+ * corrupt the results (hogwild style) */
+ if (outDim == 1)
+ {
+ for (j = 0; j < batchSize; j++)
+ {
+ long offset = j==0?0:cumSizesData[j-1];
+ real val = gradOutputData[j] * scale;
+ real* lgradWeightData = gradWeightData + offset;
+ real* lvaluesData = valuesData + offset;
+ long end = sizesData[j];
+
+ if (maxNormalize)
+ {
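+ /* gradWeight was resized to keysSize x 2 in the maxNormalize case
+ * (scale and value gradients per key), so step past `offset` keys
+ * a second time to reach this batch's rows. */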
+ lgradWeightData += offset;
+ i = 0;
+ for(;i < end; i++)
+ {
+ lgradWeightData[2*i] = val;
+ lgradWeightData[2*i+1] = val * lvaluesData[i];
+ }
+ }
+ else
+ {
+ i = 0;
+ for(;i < end-4; i += 4)
+ {
+ lgradWeightData[i] = val * lvaluesData[i];
+ lgradWeightData[i+1] = val * lvaluesData[i+1];
+ lgradWeightData[i+2] = val * lvaluesData[i+2];
+ lgradWeightData[i+3] = val * lvaluesData[i+3];
+ }
+
+ for(; i < end; i++)
+ {
+ lgradWeightData[i] = val * lvaluesData[i];
+ }
+ }
+ *gradBiasData += val;
+ offset += end;
+ }
+ }
+ else {
+ for (j = 0; j < batchSize; j++)
+ {
+ long offset = j==0?0:cumSizesData[j-1];
+ real val = 0;
+ real* lgradOutputData = gradOutputData + j*outDim;
+ real* lgradWeightData = gradWeightData;
+ real* lweightData = weightData;
+ THVector_(cadd)(gradBiasData, gradBiasData, lgradOutputData, scale, outDim);
+ for (i = 0; i < sizesData[j]; i++)
+ {
+ real val = valuesData[offset] * scale;
+ lgradWeightData = gradWeightData + offset*outDim;
+ if (maxNormalize)
+ {
+ lgradWeightData += offset*outDim;
+ k = 0;
+ for(;k < outDim-4; k += 4)
+ {
+ lgradWeightData[k] = lgradOutputData[k]*scale;
+ lgradWeightData[k+1] = lgradOutputData[k+1]*scale;
+ lgradWeightData[k+2] = lgradOutputData[k+2]*scale;
+ lgradWeightData[k+3] = lgradOutputData[k+3]*scale;
+ }
+
+ for(; k < outDim; k++)
+ {
+ lgradWeightData[k] = lgradOutputData[k]*scale;
+ }
+ lgradWeightData += outDim;
+ }
+ k = 0;
+ for(;k < outDim-4; k += 4)
+ {
+ lgradWeightData[k] = val * lgradOutputData[k];
+ lgradWeightData[k+1] = val * lgradOutputData[k+1];
+ lgradWeightData[k+2] = val * lgradOutputData[k+2];
+ lgradWeightData[k+3] = val * lgradOutputData[k+3];
+ }
+
+ for(; k < outDim; k++)
+ {
+ lgradWeightData[k] = val * lgradOutputData[k];
+ }
+ offset++;
+ }
+ }
+ }
+ THLongTensor_free(cumSizes);
+ return;
+}
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/L1Cost.c b/contrib/lua-torch/nn/lib/THNN/generic/L1Cost.c
new file mode 100644
index 000000000..53940e894
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/L1Cost.c
@@ -0,0 +1,38 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/L1Cost.c"
+#else
+
+void THNN_(L1Cost_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output)
+{
+ THNN_CHECK_DIM_SIZE(output, 1, 0, 1);
+ accreal sum = 0;
+
+ TH_TENSOR_APPLY(real, input,
+ sum += fabs(*input_data);
+ );
+
+ THTensor_(set1d)(output, 0, sum);
+}
+
+void THNN_(L1Cost_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput)
+{
+ THNN_CHECK_NELEMENT(input, gradOutput);
+ THTensor_(resizeAs)(gradInput, input);
+ TH_TENSOR_APPLY2(real, gradInput, real, input,
+ if (*input_data > 0)
+ *gradInput_data = 1;
+ else if (*input_data < 0)
+ *gradInput_data = -1;
+ else
+ *gradInput_data = 0;
+ );
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/LeakyReLU.c b/contrib/lua-torch/nn/lib/THNN/generic/LeakyReLU.c
new file mode 100644
index 000000000..074047d83
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/LeakyReLU.c
@@ -0,0 +1,57 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/LeakyReLU.c"
+#else
+
+void THNN_(LeakyReLU_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ accreal negval_,
+ bool inplace)
+{
+ real negval = TH_CONVERT_ACCREAL_TO_REAL(negval_);
+ if (inplace)
+ {
+ TH_TENSOR_APPLY(real, input,
+ if (*input_data <= 0)
+ *input_data *= negval;
+ );
+ THTensor_(set)(output, input);
+ }
+ else
+ {
+ THTensor_(resizeAs)(output, input);
+ TH_TENSOR_APPLY2(real, output, real, input,
+ *output_data = *input_data > 0 ? *input_data : *input_data * negval;
+ );
+ }
+}
+
+void THNN_(LeakyReLU_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ accreal negval_,
+ bool inplace)
+{
+ real negval = TH_CONVERT_ACCREAL_TO_REAL(negval_);
+ THNN_CHECK_NELEMENT(input, gradOutput);
+ if (inplace)
+ {
+ TH_TENSOR_APPLY2(real, gradOutput, real, input,
+ if (*input_data <= 0)
+ *gradOutput_data *= negval;
+ );
+ THTensor_(set)(gradInput, gradOutput);
+ }
+ else
+ {
+ THTensor_(resizeAs)(gradInput, input);
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
+ *gradInput_data = *input_data > 0 ? *gradOutput_data : *gradOutput_data * negval;
+ );
+ }
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/Linear.c b/contrib/lua-torch/nn/lib/THNN/generic/Linear.c
new file mode 100644
index 000000000..8c5cd115e
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/Linear.c
@@ -0,0 +1,114 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/Linear.c"
+#else
+
+void THNN_(Linear_updateAddBuffer)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *addBuffer)
+{
+ long nframe = THTensor_(size)(input,0);
+ long nElement = THTensor_(nElement)(addBuffer);
+ if (nElement != nframe) {
+ THTensor_(resize1d)(addBuffer,nframe);
+ THTensor_(fill)(addBuffer,1.0);
+ }
+}
+
+void THNN_(Linear_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *addBuffer)
+{
+ long dim = THTensor_(nDimension)(input);
+ if (dim == 1) {
+ THTensor_(resize1d)(output,THTensor_(size)(weight,0));
+ if (bias) {
+ THTensor_(copy)(output,bias);
+ }
+ else {
+ THTensor_(zero)(output);
+ }
+ THTensor_(addmv)(output,1,output,1,weight,input);
+ }
+ else if (dim == 2) {
+ long nframe = THTensor_(size)(input,0);
+ long nElement = THTensor_(nElement)(output);
+ THTensor_(resize2d)(output,nframe,THTensor_(size)(weight,0));
+ if (THTensor_(nElement)(output) != nElement) {
+ THTensor_(zero)(output);
+ }
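+ /* addBuffer is a length-nframe vector of ones; the addr() outer-product
+ * update below uses it to add the bias to every row of the batch. */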
+ THNN_(Linear_updateAddBuffer)(state,input,addBuffer);
+ THTensor *tweight = THTensor_(new)();
+ THTensor_(transpose)(tweight,weight,0,1);
+ THTensor_(addmm)(output,0,output,1,input,tweight);
+ THTensor_(free)(tweight);
+ if (bias) {
+ THTensor_(addr)(output,1,output,1,addBuffer,bias);
+ }
+ }
+}
+
+void THNN_(Linear_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight)
+{
+ if (gradInput) {
+ long nElement = THTensor_(nElement)(gradInput);
+ THTensor_(resizeAs)(gradInput,input);
+ if (THTensor_(nElement)(gradInput) != nElement) {
+ THTensor_(zero)(gradInput);
+ }
+
+ long dim = THTensor_(nDimension)(input);
+ if (dim == 1) {
+ THTensor *tweight = THTensor_(new)();
+ THTensor_(transpose)(tweight,weight,0,1);
+ THTensor_(addmv)(gradInput,0,gradInput,1,tweight,gradOutput);
+ THTensor_(free)(tweight);
+ }
+ else if (dim == 2) {
+ THTensor_(addmm)(gradInput,0,gradInput,1,gradOutput,weight);
+ }
+ }
+}
+
+void THNN_(Linear_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *addBuffer,
+ accreal scale_)
+{
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ long dim = THTensor_(nDimension)(input);
+ if (dim == 1) {
+ THTensor_(addr)(gradWeight,1,gradWeight,scale,gradOutput,input);
+ if (bias) {
+ THTensor_(cadd)(gradBias,gradBias,scale,gradOutput);
+ }
+ }
+ else if (dim == 2) {
+ THTensor *tgradOutput = THTensor_(new)();
+ THTensor_(transpose)(tgradOutput,gradOutput,0,1);
+ THTensor_(addmm)(gradWeight,1,gradWeight,scale,tgradOutput,input);
+ if (bias) {
+ THNN_(Linear_updateAddBuffer)(state,input,addBuffer);
+ THTensor_(addmv)(gradBias,1,gradBias,scale,tgradOutput,addBuffer);
+ }
+ THTensor_(free)(tgradOutput);
+ }
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/LogSigmoid.c b/contrib/lua-torch/nn/lib/THNN/generic/LogSigmoid.c
new file mode 100644
index 000000000..651d56002
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/LogSigmoid.c
@@ -0,0 +1,36 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/LogSigmoid.c"
+#else
+
+void THNN_(LogSigmoid_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *buffer)
+{
+ THTensor_(resizeAs)(output, input);
+ THTensor_(resizeAs)(buffer, input);
+
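+ /* log(sigmoid(x)) = -log(1 + exp(-x)). The buffer keeps z = exp(-x)
+ * so the backward pass can form the derivative z / (1 + z) directly. */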
+ TH_TENSOR_APPLY3(real, output, real, input, real, buffer,
+ real z = exp(-*input_data);
+ *buffer_data = z;
+ *output_data = -log(1. + z);
+ );
+}
+
+void THNN_(LogSigmoid_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *buffer)
+{
+ THNN_CHECK_NELEMENT(input, gradOutput);
+ THTensor_(resizeAs)(gradInput, buffer);
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, buffer,
+ real z = *buffer_data;
+ *gradInput_data = *gradOutput_data * z / (1. + z);
+ );
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/LogSoftMax.c b/contrib/lua-torch/nn/lib/THNN/generic/LogSoftMax.c
new file mode 100644
index 000000000..a7280422b
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/LogSoftMax.c
@@ -0,0 +1,137 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/LogSoftMax.c"
+#else
+
+void THNN_(LogSoftMax_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output)
+{
+ real *input_data, *output_data;
+ ptrdiff_t nframe = 0, dim = 0, stride = 0;
+ ptrdiff_t t, d;
+
+ if (input->nDimension == 1)
+ {
+ nframe = 1;
+ dim = input->size[0];
+ stride = 1;
+ }
+ else if (input->nDimension == 2)
+ {
+ nframe = input->size[0];
+ dim = input->size[1];
+ stride = 1;
+ }
+ else if (input->nDimension == 3)
+ {
+ nframe = 1;
+ dim = input->size[0];
+ stride = input->size[1]*input->size[2];
+ }
+ else if (input->nDimension == 4)
+ {
+ nframe = input->size[0];
+ dim = input->size[1];
+ stride = input->size[2]*input->size[3];
+ }
+ else
+ THArgCheck(0, 2, "1D, 2D, 3D or 4D tensor expected");
+
+ input = THTensor_(newContiguous)(input);
+ THTensor_(resizeAs)(output, input);
+
+ real *input_data0 = THTensor_(data)(input);
+ real *output_data0 = THTensor_(data)(output);
+
+ accreal logsum;
+ real maxInput;
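+ /* Each of the stride*nframe slices along the class dimension is handled
+ * independently: logsum = max + log(sum_d exp(x_d - max)), and
+ * output_d = x_d - logsum. Subtracting the row max keeps exp() from
+ * overflowing without changing the result. */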
+ #pragma omp parallel for private(t, d, maxInput, logsum, input_data, output_data)
+ for (t = 0; t < stride*nframe; t++)
+ {
+ logsum = 0;
+ maxInput = -THInf;
+ input_data = input_data0 + (t/stride)*dim*stride + t % stride;
+ output_data = output_data0 + (t/stride)*dim*stride + t % stride;
+
+ for (d = 0; d < dim; d++)
+ maxInput = THMax(maxInput, input_data[d*stride]);
+
+ for (d = 0; d < dim; d++)
+ logsum += exp(input_data[d*stride] - maxInput);
+ logsum = maxInput + log(logsum);
+
+ for (d = 0; d < dim; d++)
+ output_data[d*stride] = input_data[d*stride] - logsum;
+ }
+
+ THTensor_(free)(input);
+}
+
+void THNN_(LogSoftMax_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *output)
+{
+ THNN_CHECK_SHAPE(input, gradOutput);
+ real *gradInput_data, *gradOutput_data, *output_data;
+ ptrdiff_t nframe = 0, dim = 0, stride = 0;
+ ptrdiff_t t, d;
+
+ if (output->nDimension == 1)
+ {
+ nframe = 1;
+ dim = output->size[0];
+ stride = 1;
+ }
+ else if (output->nDimension == 2)
+ {
+ nframe = output->size[0];
+ dim = output->size[1];
+ stride = 1;
+ }
+ else if (output->nDimension == 3)
+ {
+ nframe = 1;
+ dim = output->size[0];
+ stride = output->size[1]*output->size[2];
+ }
+ else if (output->nDimension == 4)
+ {
+ nframe = output->size[0];
+ dim = output->size[1];
+ stride = output->size[2]*output->size[3];
+ }
+ else
+ THError("1D, 2D, 3D or 4D tensor expected");
+
+ output = THTensor_(newContiguous)(output);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ THTensor_(resizeAs)(gradInput, output);
+ real *gradInput_data0 = THTensor_(data)(gradInput);
+ real *output_data0 = THTensor_(data)(output);
+ real *gradOutput_data0 = THTensor_(data)(gradOutput);
+ accreal sum;
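+ /* Since output holds log-softmax values, exp(output) is the softmax, and
+ * gradInput_d = gradOutput_d - exp(output_d) * sum_j gradOutput_j. */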
+ #pragma omp parallel for private(t, sum, d, gradInput_data, output_data, gradOutput_data)
+ for (t = 0; t < stride*nframe; t++)
+ {
+ sum = 0;
+ gradInput_data = gradInput_data0 + (t/stride)*dim*stride + t % stride;
+ output_data = output_data0 + (t/stride)*dim*stride + t % stride;
+ gradOutput_data = gradOutput_data0 + (t/stride)*dim*stride + t % stride;
+
+ for (d = 0; d < dim; d++)
+ sum += gradOutput_data[d*stride];
+
+ for (d = 0; d < dim; d++)
+ gradInput_data[d*stride] = gradOutput_data[d*stride] - exp(output_data[d*stride])*sum;
+ }
+
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(output);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/LookupTable.c b/contrib/lua-torch/nn/lib/THNN/generic/LookupTable.c
new file mode 100644
index 000000000..46bc2c3c1
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/LookupTable.c
@@ -0,0 +1,225 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/LookupTable.c"
+#else
+
+static void THNN_(LookupTable_resetCount)(
+ THInteger_t *count_data,
+ THIndexTensor *input)
+{
+ ptrdiff_t i;
+ THIndex_t *input_data = THIndexTensor_(data)(input);
+ ptrdiff_t numel = THIndexTensor_(nElement)(input);
+
+ for (i = 0; i<numel; i++)
+ {
+ long k = input_data[i] - TH_INDEX_BASE;
+ count_data[k] = 0;
+ }
+ for (i = 0; i<numel; i++)
+ {
+ long k = input_data[i] - TH_INDEX_BASE;
+ count_data[k]++;
+ }
+}
+
+void THNN_(LookupTable_accGradParameters)(
+ THNNState *state,
+ THIndexTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THIntegerTensor *count,
+ THTensor *sorted,
+ THIndexTensor *indices,
+ bool scaleGradByFreq,
+ int paddingValue,
+ accreal ascale)
+{
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(ascale);
+ ptrdiff_t i;
+ THInteger_t *count_data = NULL;
+
+ if (scaleGradByFreq)
+ {
+ THIntegerTensor_(resize1d)(count, gradWeight->size[0]);
+ count_data = THIntegerTensor_(data)(count);
+ }
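+ /* With scaleGradByFreq, count_data[k] will hold the number of times index
+ * k occurs in this batch, and each accumulated row is divided by it. */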
+
+ if (!THTensor_(isContiguous)(gradWeight))
+ THError("gradWeight must be contiguous");
+ if (!THIndexTensor_(isContiguous)(input))
+ THError("input must be contiguous");
+ if (THIndexTensor_(nDimension)(input) != 1 && THIndexTensor_(nDimension)(input) != 2) {
+ THDescBuff s1 = THIndexTensor_(sizeDesc)(input);
+ THError("input must be a vector or matrix, but is of shape: %s", s1.str);
+ }
+
+ THIndex_t *input_data = THIndexTensor_(data)(input);
+ ptrdiff_t numel = THIndexTensor_(nElement)(input);
+ long numw = THTensor_(size)(gradWeight, 0);
+
+ // check that inputs are all within range
+ for (i=0; i<numel; i++)
+ if (input_data[i] < TH_INDEX_BASE || input_data[i] >= numw + TH_INDEX_BASE) {
+ THError("inputs need to be in the range %ld <= input < %ld, "
+ "but got input of value: %ld", TH_INDEX_BASE, (numw + TH_INDEX_BASE),
+ input_data[i]);
+ }
+
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ real *gw = THTensor_(data)(gradWeight);
+ real *go = THTensor_(data)(gradOutput);
+ long stride = THTensor_(stride)(gradWeight, 0);
+
+ if (count_data)
+ THNN_(LookupTable_resetCount)(count_data, input);
+
+#ifdef _OPENMP
+ if (numel > 1000)
+ {
+ // The strategy is to parallelize over sections of the vocabulary, so that
+ // thread 1 handles updates to gradWeight[0..nVocab/nThreads]. Every thread
+ // has to traverse the entire input, but the dominating factor is the axpy
+ // BLAS call.
+ #pragma omp parallel private(i)
+ {
+ int tid = omp_get_thread_num();
+ int nthreads = omp_get_num_threads();
+
+ long start = tid * (numw/nthreads + 1);
+ long end = start + (numw/nthreads + 1);
+ for (i=0; i<numel; i++)
+ {
+ if (input_data[i] != paddingValue)
+ {
+ long k = input_data[i] - TH_INDEX_BASE;
+ if (k >= start && k < end)
+ {
+ real scale_ = scale;
+ if (count_data) scale_ /= count_data[k];
+ THBlas_(axpy)(stride, scale_, go + i*stride, 1, gw + k*stride, 1);
+ }
+ }
+ }
+ }
+
+ THTensor_(free)(gradOutput);
+ return;
+ }
+#endif
+
+ for (i=0; i<numel; i++)
+ {
+ if (input_data[i] != paddingValue)
+ {
+ long k = input_data[i] - TH_INDEX_BASE;
+ real scale_ = scale;
+ if (count_data) scale_ /= count_data[k];
+ THBlas_(axpy)(stride, scale_, go + i*stride, 1, gw + k*stride, 1);
+ }
+ }
+
+ THTensor_(free)(gradOutput);
+}
+
+/*
+ * Keep the norm of weight smaller than maxNorm
+ */
+
+static void THNN_(LookupTable_renormRow)(
+ real *row_data,
+ long stride,
+ real maxNorm,
+ real normType)
+{
+ real norm = 0;
+ real new_norm;
+ long j;
+ for (j=0; j<stride; j++)
+ {
+ if (normType == 1) {
+ norm += fabs(row_data[j]);
+ } else if (normType == 2) {
+ norm += row_data[j] * row_data[j];
+ } else {
+ norm += pow(fabs(row_data[j]), normType);
+ }
+ }
+ norm = pow(norm, 1.0 / normType);
+ if (norm > maxNorm)
+ {
+ new_norm = maxNorm / (norm + 1e-7);
+ for (j=0; j<stride; j++) {
+ row_data[j] *= new_norm;
+ }
+ }
+}
+
+static int THNN_(compare_THIndex)(const void* a, const void* b)
+{
+ return *(const THIndex_t*)a < *(const THIndex_t*)b ? -1 : 1;
+}
+
+void THNN_(LookupTable_renorm)(
+ THNNState *state,
+ THIndexTensor *idx,
+ THTensor *weight,
+ accreal maxNorm_,
+ accreal normType_)
+{
+ real maxNorm = TH_CONVERT_ACCREAL_TO_REAL(maxNorm_);
+ real normType = TH_CONVERT_ACCREAL_TO_REAL(normType_);
+ if (!THTensor_(isContiguous)(weight))
+ THError("weight must be contiguous");
+ if (!THIndexTensor_(isContiguous)(idx))
+ THError("input must be contiguous");
+ if (THIndexTensor_(nDimension)(idx) != 1)
+ THError("idx must be a vector");
+ if (normType <= 0)
+ THError("non-positive-norm not supported");
+
+ ptrdiff_t i;
+ THIndex_t *row_idx = THIndexTensor_(data)(idx);
+ ptrdiff_t numel = THIndexTensor_(nElement)(idx);
+
+ long numw = THTensor_(size)(weight, 0);
+ long stride = THTensor_(stride)(weight, 0);
+ real *gw = THTensor_(data)(weight);
+ for (i=0; i<numel; i++) {
+ if (row_idx[i] < TH_INDEX_BASE || row_idx[i] >= numw + TH_INDEX_BASE) {
+ THError("input need to be in the range %ld <= input < %ld, "
+ "but got input of value: %ld", TH_INDEX_BASE, (numw + TH_INDEX_BASE),
+ row_idx[i]);
+ }
+ }
+ // get unique indices
+ qsort(row_idx, numel, sizeof(THIndex_t), THNN_(compare_THIndex));
+ ptrdiff_t ptr = 0;
+ for (i=0; i<numel; i++)
+ if (i == 0 || row_idx[i] != row_idx[i-1])
+ row_idx[ptr++] = row_idx[i];
+ numel = ptr;
+
+#ifdef _OPENMP
+ if (numel > 1000)
+ {
+ // The strategy is to parallelize over the rows that appear in
+ // row_idx, so that thread 1 handles the rows in row_idx[0..numel/nThreads].
+ // This distributes the work evenly to each thread.
+ #pragma omp parallel for private(i)
+ for (i=0; i<numel; i++)
+ {
+ long k = row_idx[i] - TH_INDEX_BASE;
+ THNN_(LookupTable_renormRow)(gw + k*stride, stride, maxNorm, normType);
+ }
+ return;
+ }
+#endif
+ for (i=0; i<numel; i++)
+ {
+ long k = row_idx[i] - TH_INDEX_BASE;
+ THNN_(LookupTable_renormRow)(gw + k*stride, stride, maxNorm, normType);
+ }
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/MSECriterion.c b/contrib/lua-torch/nn/lib/THNN/generic/MSECriterion.c
new file mode 100644
index 000000000..58911f6f0
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/MSECriterion.c
@@ -0,0 +1,45 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/MSECriterion.c"
+#else
+
+void THNN_(MSECriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *output,
+ bool sizeAverage)
+{
+ THNN_CHECK_NELEMENT(input, target);
+ THNN_CHECK_DIM_SIZE(output, 1, 0, 1);
+
+ real sum = 0;
+
+ TH_TENSOR_APPLY2(real, input, real, target,
+ real z = (*input_data - *target_data);
+ sum += z*z;
+ );
+
+ if (sizeAverage)
+ sum /= THTensor_(nElement)(input);
+
+ THTensor_(set1d)(output, 0, sum);
+}
+
+void THNN_(MSECriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage)
+{
+ THNN_CHECK_NELEMENT(input, target);
+
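+ /* d/dx (x - t)^2 = 2 * (x - t), so the factor is 2/nElement when
+ * averaging and 2 otherwise. */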
+ real norm = (sizeAverage ? 2./((real)THTensor_(nElement)(input)) : 2.);
+
+ THTensor_(resizeAs)(gradInput, input);
+ TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
+ *gradInput_data = norm * (*input_data - *target_data);
+ );
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/MarginCriterion.c b/contrib/lua-torch/nn/lib/THNN/generic/MarginCriterion.c
new file mode 100644
index 000000000..d6d9b60b9
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/MarginCriterion.c
@@ -0,0 +1,47 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/MarginCriterion.c"
+#else
+
+void THNN_(MarginCriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *output,
+ bool sizeAverage,
+ accreal margin_)
+{
+ real margin = TH_CONVERT_ACCREAL_TO_REAL(margin_);
+ THNN_CHECK_NELEMENT(input, target);
+ THNN_CHECK_DIM_SIZE(output, 1, 0, 1);
+ real sum = 0;
+
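+ /* Hinge loss with targets in {-1, +1}: sum of max(0, margin - x * y). */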
+ TH_TENSOR_APPLY2(real, input, real, target,
+ real z = (margin - *input_data * *target_data);
+ sum += z>0 ? z : 0;
+ );
+
+ if (sizeAverage)
+ sum /= THTensor_(nElement)(input);
+
+ THTensor_(set1d)(output, 0, sum);
+}
+
+void THNN_(MarginCriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage,
+ accreal margin_)
+{
+ real margin = TH_CONVERT_ACCREAL_TO_REAL(margin_);
+ THNN_CHECK_NELEMENT(input, target);
+ real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.);
+
+ THTensor_(resizeAs)(gradInput, input);
+ TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
+ *gradInput_data = (*input_data * *target_data) < margin ? -norm * *target_data : 0;
+ );
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/MultiLabelMarginCriterion.c b/contrib/lua-torch/nn/lib/THNN/generic/MultiLabelMarginCriterion.c
new file mode 100644
index 000000000..16398c13c
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/MultiLabelMarginCriterion.c
@@ -0,0 +1,184 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/MultiLabelMarginCriterion.c"
+#else
+
+// TODO: improve error messages
+void THNN_(MultiLabelMarginCriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THIndexTensor *target,
+ THTensor *output,
+ THTensor *isTarget,
+ bool sizeAverage)
+{
+ real *input_data, *isTarget_data;
+ THIndex_t *target_data;
+ long nframe, dim;
+ long t, d, dt, ddt;
+ real sum;
+
+ THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2,
+ "vector or matrix expected");
+
+ if (input->nDimension == 1)
+ {
+ nframe = 1;
+ dim = input->size[0];
+ THArgCheck((target->nDimension == 1) && (target->size[0] == dim), 3,
+ "inconsistent target size");
+ }
+ else
+ {
+ nframe = input->size[0];
+ dim = input->size[1];
+ THArgCheck((target->nDimension == 2) && (target->size[0] == nframe)
+ && (target->size[1] == dim), 3, "inconsistent target size");
+ }
+
+ THArgCheck(THIndexTensor_(minall)(target) >= -1+TH_INDEX_BASE, 3, "target out of range");
+ THArgCheck(THIndexTensor_(maxall)(target) < dim+TH_INDEX_BASE, 3, "target out of range");
+
+ target = THIndexTensor_(newContiguous)(target);
+ input = THTensor_(newContiguous)(input);
+ input_data = THTensor_(data)(input);
+ target_data = THIndexTensor_(data)(target);
+
+ THNN_resizeAs_indices(isTarget, target);
+ THTensor_(zero)(isTarget);
+ isTarget_data = THTensor_(data)(isTarget);
+
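+ /* Per sample: targets are a front-packed list terminated by an index
+ * below TH_INDEX_BASE. isTarget marks the target classes, and the loss is
+ * (1/dim) * sum over target y and non-target d of max(0, 1 - x[y] + x[d]). */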
+ sum = 0;
+ for (t = 0; t < nframe; t++)
+ {
+ for (ddt = 0; ddt < dim; ddt++)
+ {
+ THIndex_t target_idx = target_data[ddt] - TH_INDEX_BASE;
+ if (target_idx < 0)
+ break;
+ isTarget_data[target_idx] = 1;
+ }
+ for (dt = 0; dt < dim; dt++)
+ {
+ THIndex_t target_idx = target_data[dt] - TH_INDEX_BASE;
+ real input_target;
+ if (target_idx < 0)
+ break;
+
+ input_target = input_data[target_idx];
+ for (d = 0; d < dim; d++)
+ {
+ if (!isTarget_data[d])
+ {
+ real z = 1 - input_target + input_data[d];
+ if (z > 0)
+ sum += z;
+ }
+ }
+ }
+ input_data += dim;
+ target_data += dim;
+ isTarget_data += dim;
+ }
+
+ sum /= dim;
+ if (sizeAverage)
+ sum /= nframe;
+
+ THTensor_(set1d)(output, 0, sum);
+
+ THTensor_(free)(input);
+ THIndexTensor_(free)(target);
+}
+
+void THNN_(MultiLabelMarginCriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THIndexTensor *target,
+ THTensor *gradInput,
+ THTensor *isTarget,
+ bool sizeAverage)
+{
+ real *input_data;
+ real *gradInput_data;
+ THIndex_t *target_data;
+ real *isTarget_data;
+ long nframe, dim;
+ long t, d, dt;
+ real g;
+
+ THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2,
+ "vector or matrix expected");
+
+ if (input->nDimension == 1)
+ {
+ nframe = 1;
+ dim = input->size[0];
+ THArgCheck((target->nDimension == 1) && (target->size[0] == dim), 3,
+ "inconsistent target size");
+ THArgCheck((isTarget->nDimension == 1) && (isTarget->size[0] == dim), 3,
+ "inconsistent isTarget size");
+ }
+ else
+ {
+ nframe = input->size[0];
+ dim = input->size[1];
+ THArgCheck((target->nDimension == 2) && (target->size[0] == nframe)
+ && (target->size[1] == dim), 3, "inconsistent target size");
+ THArgCheck((isTarget->nDimension == 2) && (isTarget->size[0] == nframe)
+ && (isTarget->size[1] == dim), 3, "inconsistent isTarget size");
+ }
+
+ THArgCheck(THIndexTensor_(minall)(target) >= -1+TH_INDEX_BASE, 3, "target out of range");
+ THArgCheck(THIndexTensor_(maxall)(target) < dim+TH_INDEX_BASE, 3, "target out of range");
+
+ THArgCheck(THTensor_(minall)(isTarget) >= 0, 3, "isTarget out of range");
+ THArgCheck(THTensor_(maxall)(isTarget) <= 1, 3, "isTarget out of range");
+
+ target = THIndexTensor_(newContiguous)(target);
+ input = THTensor_(newContiguous)(input);
+ isTarget = THTensor_(newContiguous)(isTarget);
+ input_data = THTensor_(data)(input);
+ target_data = THIndexTensor_(data)(target);
+ isTarget_data = THTensor_(data)(isTarget);
+
+ g = sizeAverage ? ( 1./((real)(nframe*dim)) ) : ( 1./((real)dim) );
+
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+ gradInput_data = THTensor_(data)(gradInput);
+
+ for (t = 0; t < nframe; t++)
+ {
+ for (dt = 0; dt < dim; dt++)
+ {
+ THIndex_t target_idx = target_data[dt] - TH_INDEX_BASE;
+ real input_target;
+ if (target_idx < 0)
+ break;
+
+ input_target = input_data[target_idx];
+ for (d = 0; d < dim; d++)
+ {
+ if (!isTarget_data[d])
+ {
+ real z = 1 - input_target + input_data[d];
+ if (z > 0)
+ {
+ gradInput_data[target_idx] -= g;
+ gradInput_data[d] += g;
+ }
+ }
+ }
+ }
+ input_data += dim;
+ target_data += dim;
+ isTarget_data += dim;
+ gradInput_data += dim;
+ }
+
+ THTensor_(free)(input);
+ THIndexTensor_(free)(target);
+ THTensor_(free)(isTarget);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/MultiMarginCriterion.c b/contrib/lua-torch/nn/lib/THNN/generic/MultiMarginCriterion.c
new file mode 100644
index 000000000..2f8f8ff58
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/MultiMarginCriterion.c
@@ -0,0 +1,168 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/MultiMarginCriterion.c"
+#else
+
+// TODO: improve error messages
+void THNN_(MultiMarginCriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THIndexTensor *target,
+ THTensor *output,
+ bool sizeAverage,
+ int p,
+ THTensor *weights,
+ accreal margin_)
+{
+ real margin = TH_CONVERT_ACCREAL_TO_REAL(margin_);
+ real *input_data, *weights_data;
+ THIndex_t *target_data;
+ long nframe, dim;
+ long t, d;
+ real sum;
+
+ THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2,
+ "vector or matrix expected");
+
+ if (input->nDimension == 1)
+ {
+ nframe = 1;
+ dim = input->size[0];
+ }
+ else
+ {
+ nframe = input->size[0];
+ dim = input->size[1];
+ THArgCheck((target->nDimension == 1) && (target->size[0] == nframe), 3,
+ "inconsistent target size");
+ }
+
+ for (t = 0; t < nframe; t++)
+ {
+ THIndex_t idx = THIndexTensor_(get1d)(target, t);
+ THArgCheck((idx >= TH_INDEX_BASE) && (idx < dim + TH_INDEX_BASE), 3,
+ "target out of range");
+ }
+
+ input = THTensor_(newContiguous)(input);
+ target = THIndexTensor_(newContiguous)(target);
+ weights = weights ? THTensor_(newContiguous)(weights) : NULL;
+ input_data = THTensor_(data)(input);
+ target_data = THIndexTensor_(data)(target);
+ weights_data = weights ? THTensor_(data)(weights) : NULL;
+
+ sum = 0;
+ for (t = 0; t < nframe; t++)
+ {
+ THIndex_t target_idx = target_data[t] - TH_INDEX_BASE;
+ real input_target = input_data[target_idx];
+ for (d = 0; d < dim; d++)
+ {
+ real z = margin - input_target + input_data[d];
+ if (d == target_idx)
+ continue;
+
+ if (z > 0) {
+ real h = (p==1) ? z : z*z;
+ if(weights_data)
+ h *= weights_data[target_idx];
+ sum += h;
+ }
+ }
+ input_data += dim;
+ }
+
+ sum /= dim;
+ if(sizeAverage)
+ sum /= nframe;
+
+ THTensor_(set1d)(output, 0, sum);
+
+ THTensor_(free)(input);
+ THIndexTensor_(free)(target);
+ if(weights)
+ THTensor_(free)(weights);
+}
+
+void THNN_(MultiMarginCriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THIndexTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage,
+ int p,
+ THTensor *weights,
+ accreal margin_)
+{
+ real margin = TH_CONVERT_ACCREAL_TO_REAL(margin_);
+ real *input_data;
+ real *gradInput_data;
+ THIndex_t *target_data;
+ real *weights_data;
+ long nframe, dim;
+ long t, d;
+ real g;
+
+ THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2,
+ "vector or matrix expected");
+
+ if (input->nDimension == 1)
+ {
+ nframe = 1;
+ dim = input->size[0];
+ }
+ else
+ {
+ nframe = input->size[0];
+ dim = input->size[1];
+ THArgCheck((target->nDimension == 1) && (target->size[0] == nframe), 3,
+ "inconsistent target size");
+ }
+
+ g = (sizeAverage ? 1./((real)(nframe*dim)) : 1./((real)dim));
+
+ input = THTensor_(newContiguous)(input);
+ target = THIndexTensor_(newContiguous)(target);
+ input_data = THTensor_(data)(input);
+
+ THTensor_(resizeAs)(gradInput, input);
+ gradInput_data = THTensor_(data)(gradInput);
+
+ target_data = THIndexTensor_(data)(target);
+ weights = weights ? THTensor_(newContiguous)(weights) : NULL;
+ weights_data = weights ? THTensor_(data)(weights) : NULL;
+
+ for (t = 0; t < nframe; t++)
+ {
+ THIndex_t target_idx = target_data[t] - TH_INDEX_BASE;
+ real input_target = input_data[target_idx];
+ real gradInput_target = 0;
+ for (d = 0; d < dim; d++)
+ {
+ real z = margin - input_target + input_data[d];
+ if (d == target_idx)
+ continue;
+
+ if (z > 0)
+ {
+ real h = (p == 1) ? g : 2*g*z;
+ if(weights_data)
+ h *= weights_data[target_idx];
+ gradInput_target -= h;
+ gradInput_data[d] = h;
+ }
+ else
+ gradInput_data[d] = 0;
+ }
+ gradInput_data[target_idx] = gradInput_target;
+
+ input_data += dim;
+ gradInput_data += dim;
+ }
+
+ THTensor_(free)(input);
+ THIndexTensor_(free)(target);
+ if(weights)
+ THTensor_(free)(weights);
+}
+
+#endif
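
The forward pass above implements the multi-class margin loss L(x, y) = (1/dim) * sum over d != y of w[y] * max(0, margin - x[y] + x[d])^p, with p restricted to 1 or 2 and optional per-class weights. A minimal standalone sketch (not part of the patch; 0-based target index, hypothetical names):

#include <stdio.h>

/* Multi-class margin loss for one sample; p is 1 or 2, weights may be
 * NULL. Mirrors the inner loop of updateOutput above. */
static double multi_margin(const double *x, int y, int dim,
                           double margin, int p, const double *weights)
{
    double sum = 0.0;
    for (int d = 0; d < dim; d++) {
        if (d == y)
            continue;
        double z = margin - x[y] + x[d];
        if (z > 0.0) {
            double h = (p == 1) ? z : z * z;
            if (weights)
                h *= weights[y];
            sum += h;
        }
    }
    return sum / dim;
}

int main(void)
{
    double x[3] = {1.0, 0.2, 0.9};
    printf("loss = %f\n", multi_margin(x, 0, 3, 1.0, 1, NULL));  /* 0.366667 */
    return 0;
}
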
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/PReLU.c b/contrib/lua-torch/nn/lib/THNN/generic/PReLU.c
new file mode 100644
index 000000000..488322fde
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/PReLU.c
@@ -0,0 +1,207 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/PReLU.c"
+#else
+
+void THNN_(PReLU_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THIndex_t nOutputPlane)
+{
+ THTensor_(resizeAs)(output, input);
+
+ if (nOutputPlane == 0)
+ {
+ // handle shared parameter case
+ real w = *THTensor_(data)(weight);
+ TH_TENSOR_APPLY2(real, output, real, input,
+ *output_data = (*input_data > 0) ? *input_data : w*(*input_data);
+ );
+ }
+ else
+ {
+ input = THTensor_(newContiguous)(input);
+ long bs = 1, ks = 1;
+ {
+ long input_ndim = THTensor_(nDimension)(input);
+ if (input->size[input_ndim > 1] != nOutputPlane)
+ THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size[input_ndim > 1]);
+
+ if (input_ndim > 1) {
+ bs = input->size[0];
+ for (int d = 2; d < input_ndim; d++) {
+ ks *= input->size[d];
+ }
+ }
+ }
+
+ real *output_data = THTensor_(data)(output);
+ real *input_data = THTensor_(data)(input);
+ real *weight_data = THTensor_(data)(weight);
+ THIndex_t i, j, k;
+#pragma omp parallel for private(j,k)
+ for (i = 0; i < bs; ++i)
+ {
+ real* n_input_data = input_data + i*nOutputPlane*ks;
+ real* n_output_data = output_data + i*nOutputPlane*ks;
+ for (j = 0; j < nOutputPlane; ++j)
+ {
+ for (k = 0; k < ks; ++k)
+ n_output_data[k] = (n_input_data[k] > 0) ? n_input_data[k] : weight_data[j] * n_input_data[k];
+ n_input_data += ks;
+ n_output_data += ks;
+ }
+ }
+ THTensor_(free)(input);
+ }
+}
+
+void THNN_(PReLU_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THIndex_t nOutputPlane)
+{
+ THNN_CHECK_NELEMENT(input, gradOutput);
+ THTensor_(resizeAs)(gradInput, input);
+
+ if (nOutputPlane == 0)
+ {
+ real w = THTensor_(data)(weight)[0];
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
+ if ((*input_data) > 0)
+ *gradInput_data = *gradOutput_data;
+ else
+ *gradInput_data = w * (*gradOutput_data);
+ );
+ }
+ else
+ {
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ weight = THTensor_(newContiguous)(weight);
+ const real *input_data = THTensor_(data)(input);
+ const real *gradOutput_data = THTensor_(data)(gradOutput);
+ const real *weight_data = THTensor_(data)(weight);
+ real *gradInput_data = THTensor_(data)(gradInput);
+
+ long bs = 1, ks = 1;
+ {
+ long input_ndim = THTensor_(nDimension)(input);
+ if (input->size[input_ndim > 1] != nOutputPlane)
+ THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size[input_ndim > 1]);
+
+ if (input_ndim > 1) {
+ bs = input->size[0];
+ for (int d = 2; d < input_ndim; d++) {
+ ks *= input->size[d];
+ }
+ }
+ }
+
+ THIndex_t i, j, k;
+#pragma omp parallel for private(j,k)
+ for (i = 0; i < bs; ++i)
+ {
+ const real *n_input_data = input_data + i*nOutputPlane*ks;
+ const real *n_gradOutput_data = gradOutput_data + i*nOutputPlane*ks;
+ real *n_gradInput_data = gradInput_data + i*nOutputPlane*ks;
+
+ for (j = 0; j < nOutputPlane; ++j)
+ {
+ real w = weight_data[j];
+ for (k = 0; k < ks; ++k)
+ {
+ if (n_input_data[k] > 0)
+ n_gradInput_data[k] = n_gradOutput_data[k];
+ else
+ n_gradInput_data[k] = n_gradOutput_data[k] * w;
+ }
+ n_input_data += ks;
+ n_gradInput_data += ks;
+ n_gradOutput_data += ks;
+ }
+ }
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(weight);
+ }
+}
+
+void THNN_(PReLU_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *gradWeight,
+ THTensor *gradWeightBuf,
+ THTensor *gradWeightBuf2,
+ THIndex_t nOutputPlane,
+ accreal scale_)
+{
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ THNN_CHECK_NELEMENT(input, gradOutput);
+
+ if (nOutputPlane == 0)
+ {
+ real *gradWeight_data = THTensor_(data)(gradWeight);
+ real sum = 0;
+ TH_TENSOR_APPLY2(real, input, real, gradOutput,
+ if ((*input_data) <= 0)
+ sum += (*input_data) * (*gradOutput_data);
+ );
+ gradWeight_data[0] += scale * sum;
+ }
+ else
+ {
+ THArgCheck(THTensor_(isContiguous)(gradWeight), 6, "gradWeight needs to be contiguous");
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ weight = THTensor_(newContiguous)(weight);
+ long bs = 1, ks = 1;
+ {
+ long input_ndim = THTensor_(nDimension)(input);
+ if (input->size[input_ndim > 1] != nOutputPlane)
+ THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size[input_ndim > 1]);
+
+ if (input_ndim > 1) {
+ bs = input->size[0];
+ for (int d = 2; d < input_ndim; d++) {
+ ks *= input->size[d];
+ }
+ }
+ }
+
+ const real *input_data = THTensor_(data)(input);
+ const real *gradOutput_data = THTensor_(data)(gradOutput);
+ const real *weight_data = THTensor_(data)(weight);
+ real *gradWeight_data = THTensor_(data)(gradWeight);
+
+ THIndex_t i, j, k;
+ for (i = 0; i < bs; ++i)
+ {
+ const real *n_input_data = input_data + i*nOutputPlane*ks;
+ const real *n_gradOutput_data = gradOutput_data + i*nOutputPlane*ks;
+
+ for (j = 0; j < nOutputPlane; ++j)
+ {
+ real sum = 0;
+ for (k = 0; k < ks; ++k)
+ if (n_input_data[k] <= 0)
+ sum += n_gradOutput_data[k] * n_input_data[k];
+ gradWeight_data[j] += scale * sum;
+ n_input_data += ks;
+ n_gradOutput_data += ks;
+ }
+ }
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(weight);
+ }
+}
+
+#endif
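
PReLU computes f(x) = x for x > 0 and f(x) = a*x otherwise, with one learned slope shared across all elements (nOutputPlane == 0) or one slope per channel; accGradParameters above accumulates dL/da as the sum of x * dL/dy over the elements where x <= 0. A minimal per-channel forward sketch (not part of the patch; hypothetical names, channel-major layout as in the kernels above):

#include <stdio.h>

/* PReLU forward for a (channels x k) block with one slope per channel,
 * matching the per-plane loop above. */
static void prelu_forward(const double *x, double *y,
                          const double *slope, int channels, int k)
{
    for (int j = 0; j < channels; j++)
        for (int i = 0; i < k; i++) {
            double v = x[j * k + i];
            y[j * k + i] = (v > 0.0) ? v : slope[j] * v;
        }
}

int main(void)
{
    double x[4] = {1.0, -2.0, -3.0, 4.0};   /* 2 channels, 2 values each */
    double slope[2] = {0.25, 0.1};
    double y[4];
    prelu_forward(x, y, slope, 2, 2);
    for (int i = 0; i < 4; i++)
        printf("%f\n", y[i]);               /* 1.0 -0.5 -0.3 4.0 */
    return 0;
}
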
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/RReLU.c b/contrib/lua-torch/nn/lib/THNN/generic/RReLU.c
new file mode 100644
index 000000000..8fd46d3c2
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/RReLU.c
@@ -0,0 +1,132 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/RReLU.c"
+#else
+
+void THNN_(RReLU_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *noise,
+ accreal lower_,
+ accreal upper_,
+ bool train,
+ bool inplace,
+ THGenerator *generator)
+{
+ real lower = TH_CONVERT_ACCREAL_TO_REAL(lower_);
+ real upper = TH_CONVERT_ACCREAL_TO_REAL(upper_);
+ if (train)
+ {
+    // the sampled slope for each element is recorded in the noise tensor (the generator is passed in)
+ THTensor_(resizeAs)(noise, input);
+ if (inplace)
+ {
+ TH_TENSOR_APPLY2(real, input, real, noise,
+ if (*input_data <= 0)
+ {
+ const real r = (real)THRandom_uniform(generator, lower, upper);
+ *input_data = (*input_data) * r;
+ *noise_data = r;
+ }
+ else
+ {
+ *noise_data = 1;
+ }
+ );
+ THTensor_(set)(output, input);
+ }
+ else
+ {
+ THTensor_(resizeAs)(output, input);
+ TH_TENSOR_APPLY3(real, input, real, output, real, noise,
+ if (*input_data <= 0)
+ {
+ const real r = (real)THRandom_uniform(generator, lower, upper);
+ *output_data = (*input_data) * r;
+ *noise_data = r;
+ }
+ else
+ {
+ *output_data = *input_data;
+ *noise_data = 1;
+ }
+ );
+ }
+ }
+ else
+ {
+ const real negSlope = (lower + upper) / 2;
+ if (inplace)
+ {
+ TH_TENSOR_APPLY(real, input,
+ if (*input_data <= 0)
+ {
+ *input_data = *input_data * negSlope;
+ }
+ );
+ THTensor_(set)(output, input);
+ }
+ else
+ {
+ THTensor_(resizeAs)(output, input);
+ TH_TENSOR_APPLY2(real, input, real, output,
+ const real r = (*input_data) <= 0 ? negSlope : 1;
+ *output_data = *input_data * r;
+ );
+ }
+ }
+}
+
+void THNN_(RReLU_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *noise,
+ accreal lower_,
+ accreal upper_,
+ bool train,
+ bool inplace)
+{
+ real lower = TH_CONVERT_ACCREAL_TO_REAL(lower_);
+ real upper = TH_CONVERT_ACCREAL_TO_REAL(upper_);
+ THNN_CHECK_NELEMENT(input, gradOutput);
+ if (train && upper - lower > 1E-6) // e.g. if upper == lower, RReLU behaves like LeakyReLU
+ {
+ // multiply the gradient by the noise tensor
+ if (inplace)
+ {
+ THTensor_(cmul)(gradOutput, gradOutput, noise);
+ THTensor_(set)(gradInput, gradOutput);
+ }
+ else
+ {
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(cmul)(gradInput, gradOutput, noise);
+ }
+ }
+ else
+ {
+ // use constant factor for negative input values
+ const real negSlope = (lower + upper) / 2;
+ if (inplace)
+ {
+ TH_TENSOR_APPLY2(real, gradOutput, real, input,
+ if (*input_data <= 0)
+ {
+ *gradOutput_data = (*gradOutput_data) * negSlope;
+ }
+ );
+ THTensor_(set)(gradInput, gradOutput);
+ }
+ else
+ {
+ THTensor_(resizeAs)(gradInput, input);
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
+ *gradInput_data = (*input_data) <= 0 ? (*gradOutput_data) * negSlope : (*gradOutput_data);
+ );
+ }
+ }
+}
+
+#endif
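
In training mode the code above scales each non-positive input by a slope drawn uniformly from [lower, upper] and records it in noise for the backward pass; in evaluation mode it applies the fixed slope (lower + upper) / 2, i.e. it degenerates to LeakyReLU. A minimal sketch of the training-mode rule (not part of the patch; rand() stands in for THRandom_uniform, names hypothetical):

#include <stdio.h>
#include <stdlib.h>

/* RReLU in training mode: non-positive inputs are scaled by a slope
 * drawn uniformly from [lower, upper]; the slope is kept for backprop. */
static double rrelu_train(double x, double lower, double upper, double *noise)
{
    if (x <= 0.0) {
        double r = lower + (upper - lower) * ((double)rand() / RAND_MAX);
        *noise = r;
        return x * r;
    }
    *noise = 1.0;
    return x;
}

int main(void)
{
    srand(42);
    double noise;
    double y = rrelu_train(-1.5, 1.0 / 8.0, 1.0 / 3.0, &noise);
    printf("y = %f, slope kept for backward = %f\n", y, noise);
    return 0;
}
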
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/Sigmoid.c b/contrib/lua-torch/nn/lib/THNN/generic/Sigmoid.c
new file mode 100644
index 000000000..17fb2cb4d
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/Sigmoid.c
@@ -0,0 +1,28 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/Sigmoid.c"
+#else
+
+void THNN_(Sigmoid_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output)
+{
+ THTensor_(sigmoid)(output, input);
+}
+
+void THNN_(Sigmoid_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *output)
+{
+ THNN_CHECK_NELEMENT(output, gradOutput);
+ THTensor_(resizeAs)(gradInput, output);
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output,
+ real z = *output_data;
+ *gradInput_data = *gradOutput_data * (1. - z) * z;
+ );
+}
+
+#endif
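
The backward pass above exploits that for y = sigmoid(x) the derivative can be written purely in terms of the saved output, dy/dx = y * (1 - y), so no exponential is re-evaluated. A worked example (not part of the patch):

#include <stdio.h>
#include <math.h>

int main(void)
{
    /* Backward needs only the saved forward output:
     * d(sigmoid)/dx = y * (1 - y). */
    double x = 0.7;
    double y = 1.0 / (1.0 + exp(-x));
    double grad_out = 1.0;                      /* upstream gradient */
    double grad_in = grad_out * (1.0 - y) * y;
    printf("y = %f, dL/dx = %f\n", y, grad_in); /* ~0.668188, ~0.221713 */
    return 0;
}
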
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SmoothL1Criterion.c b/contrib/lua-torch/nn/lib/THNN/generic/SmoothL1Criterion.c
new file mode 100644
index 000000000..d1928d11c
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SmoothL1Criterion.c
@@ -0,0 +1,49 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SmoothL1Criterion.c"
+#else
+
+void THNN_(SmoothL1Criterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *output,
+ bool sizeAverage)
+{
+ THNN_CHECK_NELEMENT(input, target);
+ THNN_CHECK_DIM_SIZE(output, 1, 0, 1);
+
+ real sum = 0;
+ TH_TENSOR_APPLY2(real, input, real, target,
+ real z = fabs(*input_data - *target_data);
+ sum += z < 1 ? 0.5*z*z : z - 0.5;
+ );
+
+ if (sizeAverage)
+ sum /= THTensor_(nElement)(input);
+
+ THTensor_(set1d)(output, 0, sum);
+}
+
+void THNN_(SmoothL1Criterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage)
+{
+ THNN_CHECK_NELEMENT(input, target);
+ real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.);
+
+ THTensor_(resizeAs)(gradInput, input);
+ TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
+ real x = *input_data - *target_data;
+ if (x < -1.)
+ *gradInput_data = - norm;
+ else if (x > 1.)
+ *gradInput_data = norm;
+ else
+ *gradInput_data = norm * x;
+ );
+}
+
+#endif
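
This is the Huber loss with delta = 1: quadratic (0.5 z^2) for z = |x - t| < 1 and linear (z - 0.5) beyond, so the two branches and their derivatives meet at z = 1; the gradient is accordingly clamped to [-norm, norm]. A minimal sketch of the element-wise loss (not part of the patch):

#include <stdio.h>
#include <math.h>

/* Smooth L1 for one element: quadratic inside |x - t| < 1, linear
 * outside, continuous at the joint (0.5 on both sides). */
static double smooth_l1(double x, double t)
{
    double z = fabs(x - t);
    return (z < 1.0) ? 0.5 * z * z : z - 0.5;
}

int main(void)
{
    printf("%f\n", smooth_l1(0.3, 0.0));   /* 0.045, quadratic branch */
    printf("%f\n", smooth_l1(2.0, 0.0));   /* 1.5,   linear branch    */
    return 0;
}
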
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SoftMarginCriterion.c b/contrib/lua-torch/nn/lib/THNN/generic/SoftMarginCriterion.c
new file mode 100644
index 000000000..bac0a3b53
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SoftMarginCriterion.c
@@ -0,0 +1,44 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SoftMarginCriterion.c"
+#else
+
+void THNN_(SoftMarginCriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *output,
+ bool sizeAverage)
+{
+ THNN_CHECK_NELEMENT(input, target);
+ THNN_CHECK_DIM_SIZE(output, 1, 0, 1);
+
+ real sum;
+
+ sum = 0;
+ TH_TENSOR_APPLY2(real, input, real, target,
+ real z = log(1. + exp(-*input_data* *target_data));
+ sum += z;)
+
+ if(sizeAverage)
+ sum /= THTensor_(nElement)(input);
+
+ THTensor_(set1d)(output, 0, sum);
+}
+
+void THNN_(SoftMarginCriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage)
+{
+ THNN_CHECK_NELEMENT(input, target);
+ real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.);
+
+ THTensor_(resizeAs)(gradInput, input);
+ TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
+ real z = exp(-*target_data * *input_data);
+ *gradInput_data = -norm*(*target_data)*z/(1. + z);)
+}
+
+#endif
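
The loss above is L(x, y) = log(1 + exp(-x*y)) for targets y in {-1, 1}, with gradient -y*z / (1 + z) where z = exp(-x*y), exactly the form used in updateGradInput. A worked example (not part of the patch; log1p is used here for accuracy where the code above uses log(1 + ...)):

#include <stdio.h>
#include <math.h>

int main(void)
{
    /* Soft margin loss for one (input, target) pair, target in {-1, 1}. */
    double x = 2.0, y = 1.0;
    double z = exp(-x * y);
    double loss = log1p(z);
    double grad = -y * z / (1.0 + z);   /* same form as the backward above */
    printf("loss = %f, grad = %f\n", loss, grad);  /* 0.126928, -0.119203 */
    return 0;
}
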
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SoftMax.c b/contrib/lua-torch/nn/lib/THNN/generic/SoftMax.c
new file mode 100644
index 000000000..7b60d64c2
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SoftMax.c
@@ -0,0 +1,150 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SoftMax.c"
+#else
+
+void THNN_(SoftMax_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output)
+{
+ real *input_data, *output_data;
+ ptrdiff_t nframe = 0, dim = 0, stride = 0;
+ ptrdiff_t t;
+
+ if (input->nDimension == 1)
+ {
+ nframe = 1;
+ dim = input->size[0];
+ stride = 1;
+ }
+ else if (input->nDimension == 2)
+ {
+ nframe = input->size[0];
+ dim = input->size[1];
+ stride = 1;
+ }
+ else if (input->nDimension == 3)
+ {
+ nframe = 1;
+ dim = input->size[0];
+ stride = input->size[1]*input->size[2];
+ }
+ else if (input->nDimension == 4)
+ {
+ nframe = input->size[0];
+ dim = input->size[1];
+ stride = input->size[2]*input->size[3];
+ }
+ else
+ {
+ THArgCheck(0, 2, "1D, 2D, 3D or 4D tensor expected");
+ }
+
+ input = THTensor_(newContiguous)(input);
+ THTensor_(resizeAs)(output, input);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+
+#pragma omp parallel for private(t)
+ for (t = 0; t < stride*nframe; t++)
+ {
+ real *input_ptr = input_data + (t/stride)*dim*stride + t % stride;
+ real *output_ptr = output_data + (t/stride)*dim*stride + t % stride;
+
+ real inputMax = -THInf;
+ accreal sum;
+
+ ptrdiff_t d;
+ for (d = 0; d < dim; d++)
+ {
+ if (input_ptr[d*stride] >= inputMax) inputMax = input_ptr[d*stride];
+ }
+
+ sum = 0;
+ for (d = 0; d < dim; d++)
+ {
+ real z = exp(input_ptr[d*stride] - inputMax);
+ output_ptr[d*stride] = z;
+ sum += z;
+ }
+
+ for (d = 0; d < dim; d++)
+ {
+ output_ptr[d*stride] *= 1/sum;
+ }
+ }
+
+ THTensor_(free)(input);
+}
+
+void THNN_(SoftMax_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *output)
+{
+ THNN_CHECK_SHAPE(input, gradOutput);
+ real *gradInput_data, *gradOutput_data, *output_data;
+ ptrdiff_t nframe = 0, dim = 0, stride = 0;
+ ptrdiff_t t;
+
+ if (output->nDimension == 1)
+ {
+ nframe = 1;
+ dim = output->size[0];
+ stride = 1;
+ }
+ else if (output->nDimension == 2)
+ {
+ nframe = output->size[0];
+ dim = output->size[1];
+ stride = 1;
+ }
+ else if (output->nDimension == 3)
+ {
+ nframe = 1;
+ dim = output->size[0];
+ stride = output->size[1]*output->size[2];
+ }
+ else if (output->nDimension == 4)
+ {
+ nframe = output->size[0];
+ dim = output->size[1];
+ stride = output->size[2]*output->size[3];
+ }
+ else
+ {
+ THError("1D, 2D, 3D or 4D tensor expected");
+ }
+
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ output = THTensor_(newContiguous)(output);
+
+ THTensor_(resizeAs)(gradInput, output);
+ gradInput_data = THTensor_(data)(gradInput);
+ output_data = THTensor_(data)(output);
+ gradOutput_data = THTensor_(data)(gradOutput);
+
+#pragma omp parallel for private(t)
+ for (t = 0; t < stride*nframe; t++)
+ {
+ real *gradInput_ptr = gradInput_data + (t/stride)*dim*stride + t % stride;
+ real *output_ptr = output_data + (t/stride)*dim*stride + t % stride;
+ real *gradOutput_ptr = gradOutput_data + (t/stride)*dim*stride + t % stride;
+
+ ptrdiff_t d;
+ accreal sum = 0;
+ for (d = 0; d < dim; d++)
+ sum += (accreal)gradOutput_ptr[d*stride] * output_ptr[d*stride];
+
+ for (d = 0; d < dim; d++)
+ gradInput_ptr[d*stride] = output_ptr[d*stride] * (gradOutput_ptr[d*stride] - sum);
+ }
+
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(output);
+}
+
+#endif
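
The kernel above subtracts the per-slice maximum before exponentiating, which keeps every exp() argument non-positive and avoids overflow without changing the result, since softmax is invariant to a constant shift of its inputs. A minimal sketch of the same trick (not part of the patch; hypothetical names):

#include <stdio.h>
#include <math.h>

/* Numerically stable softmax over a vector: subtracting the maximum
 * first keeps every exp() argument <= 0, as in the kernel above. */
static void softmax(const double *x, double *y, int n)
{
    double m = x[0], sum = 0.0;
    for (int i = 1; i < n; i++)
        if (x[i] > m) m = x[i];
    for (int i = 0; i < n; i++) {
        y[i] = exp(x[i] - m);
        sum += y[i];
    }
    for (int i = 0; i < n; i++)
        y[i] /= sum;
}

int main(void)
{
    double x[3] = {1000.0, 1001.0, 1002.0};  /* would overflow naively */
    double y[3];
    softmax(x, y, 3);
    printf("%f %f %f\n", y[0], y[1], y[2]);
    return 0;
}
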
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SoftPlus.c b/contrib/lua-torch/nn/lib/THNN/generic/SoftPlus.c
new file mode 100644
index 000000000..6491e66d6
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SoftPlus.c
@@ -0,0 +1,47 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SoftPlus.c"
+#else
+
+void THNN_(SoftPlus_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ accreal beta_,
+ accreal threshold_)
+{
+ real beta = TH_CONVERT_ACCREAL_TO_REAL(beta_);
+ real threshold = TH_CONVERT_ACCREAL_TO_REAL(threshold_);
+ THTensor_(resizeAs)(output, input);
+
+ // f(x) = 1/beta * log(1 + exp(beta * x))
+  TH_TENSOR_APPLY2(real, output, real, input,
+ *output_data = (*input_data * beta) > threshold ? *input_data : THLog1p(exp(*input_data * beta)) / beta;
+ );
+}
+
+void THNN_(SoftPlus_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *output,
+ accreal beta_,
+ accreal threshold_)
+{
+ real beta = TH_CONVERT_ACCREAL_TO_REAL(beta_);
+ real threshold = TH_CONVERT_ACCREAL_TO_REAL(threshold_);
+ THNN_CHECK_NELEMENT(input, gradOutput);
+ THTensor_(resizeAs)(gradInput, output);
+
+ // d/dx[log(1+exp(k*x))/k] = exp(kx) / (exp(kx) + 1)
+ // SINCE
+ // y = (1/k)*log(1+exp(k*x)) --> x = (1/k)*log(exp(k*y)-1)
+ // THEREFORE:
+ // d/dx(f(x)) = (exp(k*y) - 1) / exp(k*y)
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output,
+ real z = exp(*output_data * beta);
+ *gradInput_data = (*output_data * beta) > threshold ? *gradOutput_data : *gradOutput_data * (z - 1.)/z;
+ );
+}
+
+#endif
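
For beta*x above the threshold, exp(beta*x) would overflow while log(1 + exp(beta*x))/beta is numerically indistinguishable from x, so both kernels above fall back to the identity; the derivative is likewise rewritten in terms of the saved output as (exp(beta*y) - 1) / exp(beta*y). A minimal forward sketch (not part of the patch):

#include <stdio.h>
#include <math.h>

/* SoftPlus f(x) = log(1 + exp(beta*x)) / beta; above the threshold the
 * identity is returned directly, exactly as in the kernel above. */
static double softplus(double x, double beta, double threshold)
{
    return (x * beta > threshold) ? x : log1p(exp(x * beta)) / beta;
}

int main(void)
{
    printf("%f\n", softplus(0.0, 1.0, 20.0));   /* log(2) ~ 0.693147 */
    printf("%f\n", softplus(50.0, 1.0, 20.0));  /* linear regime: 50 */
    return 0;
}
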
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SoftShrink.c b/contrib/lua-torch/nn/lib/THNN/generic/SoftShrink.c
new file mode 100644
index 000000000..e77950868
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SoftShrink.c
@@ -0,0 +1,42 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SoftShrink.c"
+#else
+
+void THNN_(SoftShrink_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ accreal lambda_)
+{
+ real lambda = TH_CONVERT_ACCREAL_TO_REAL(lambda_);
+ THTensor_(resizeAs)(output, input);
+
+ TH_TENSOR_APPLY2(real, output, real, input,
+ if ((*input_data) > lambda)
+ *output_data = *input_data - lambda;
+ else if ((*input_data) < -lambda)
+ *output_data = *input_data + lambda;
+ else
+ *output_data = 0;
+ );
+}
+
+void THNN_(SoftShrink_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ accreal lambda_)
+{
+ real lambda = TH_CONVERT_ACCREAL_TO_REAL(lambda_);
+ THNN_CHECK_NELEMENT(input, gradOutput);
+ THTensor_(resizeAs)(gradInput, input);
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
+ if ((*input_data) > lambda || (*input_data) < -lambda)
+ *gradInput_data = (*gradOutput_data);
+ else
+ *gradInput_data = 0;
+ );
+}
+
+#endif
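
Soft shrinkage moves each value toward zero by lambda and zeroes the band [-lambda, lambda]; the gradient above passes through unchanged outside the band and is zero inside it. A minimal sketch (not part of the patch):

#include <stdio.h>

/* Soft shrinkage: shift values toward zero by lambda and zero out
 * the [-lambda, lambda] band, as in the kernel above. */
static double soft_shrink(double x, double lambda)
{
    if (x > lambda)  return x - lambda;
    if (x < -lambda) return x + lambda;
    return 0.0;
}

int main(void)
{
    double xs[3] = {1.5, 0.2, -0.9};
    for (int i = 0; i < 3; i++)
        printf("%f -> %f\n", xs[i], soft_shrink(xs[i], 0.5));
    return 0;
}
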
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SparseLinear.c b/contrib/lua-torch/nn/lib/THNN/generic/SparseLinear.c
new file mode 100644
index 000000000..1cf712212
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SparseLinear.c
@@ -0,0 +1,564 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SparseLinear.c"
+#else
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#define ROW_PTR2(t, r) (THTensor_(data)(t) + (r) * (t)->stride[0])
+#define COL_PTR2(t, c) (THTensor_(data)(t) + (c) * (t)->stride[1])
+
+static bool THNN_(checkLegacyInput)(THTensor* t)
+{
+ return t->nDimension == 3 && t->size[2] == 2;
+}
+
+static bool THNN_(checkInput)(THTensor* t)
+{
+ return t->nDimension == 2 && t->size[1] == 3;
+}
+
+static bool THNN_(checkSize2D)(THTensor* t, long size0, long size1)
+{
+ return t->nDimension == 2 && t->size[0] == size0 && t->size[1] == size1;
+}
+
+static bool THNN_(checkSize1D)(THTensor* t, long size0)
+{
+ return t->nDimension == 1 && t->size[0] == size0;
+}
+
+static void THNN_(set1d)(THTensor *t, long x0, real value) {
+ THStorage_(set)(t->storage, t->storageOffset + x0*t->stride[0], value);
+}
+static real THNN_(get3d)(const THTensor *t, long x0, long x1, long x2) {
+ return THStorage_(get)(t->storage, t->storageOffset +
+ x0*t->stride[0] + x1*t->stride[1] + x2*t->stride[2]);
+}
+static real THNN_(get2d)(const THTensor *t, long x0, long x1) {
+ return THStorage_(get)(t->storage, t->storageOffset +
+ x0*t->stride[0] + x1*t->stride[1]);
+}
+
+void THNN_(SparseLinear_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias)
+{
+ long h, i, j, hp0, hp1;
+ long outDim = THTensor_(size)(weight, 0);
+ long inDim = THTensor_(size)(weight, 1);
+ long batchSize = THTensor_(size)(output, 0);
+
+ THArgCheck(THNN_(checkInput)(input), 2, "input must be in coo format, nnz x 3");
+ THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous");
+ THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong");
+
+ long nnz = THTensor_(size)(input, 0);
+
+ THLongTensor * csr = THLongTensor_newWithSize1d(batchSize+1);
+ THLongTensor_zero(csr);
+
+ weight = THTensor_(newContiguous)(weight);
+
+//#pragma omp parallel for private(i, h, hp0, hp1) schedule(static) if (nnz > 10000)
+ for (i=0; i<nnz; i++) {
+ hp0 = (long)(THNN_(get2d)(input, i, 0)) - 1;
+ hp1 = (i+1 == nnz) ?
+ batchSize :
+ (long)(THNN_(get2d)(input, i+1, 0)) - 1;
+ if (hp0 != hp1) for (h = hp0; h < hp1; h++) {
+ THLongTensor_set1d(csr, h+1, i+1);
+ }
+ }
+
+
+ // output = weight * input + bias
+ THTensor_(zero)(output);
+#pragma omp parallel for private(h, i) schedule(static) if (nnz > 10000)
+ for (h = 0; h < batchSize; h++) {
+ long i_start = THLongTensor_get1d(csr, h);
+ long i_end = THLongTensor_get1d(csr, h+1);
+ for (i = i_start; i < i_end; i++) {
+ real val = THNN_(get2d)(input, i, 2);
+ if (val == 0) {
+ continue;
+ }
+
+ long offset = (long)(THNN_(get2d)(input, i, 1)) - 1;
+ if (offset >= 0 && offset < inDim) {
+ THBlas_(axpy)(outDim,
+ val,
+ COL_PTR2(weight, offset), weight->stride[0],
+ ROW_PTR2(output, h), output->stride[1]);
+ } else {
+        THError("index out of bounds. updateOutput: %d not between 1 and %d",
+ offset + 1, inDim);
+ }
+ }
+ }
+
+ THTensor* output_row = THTensor_(new)();
+ for (h = 0; h < batchSize; h++) {
+ THTensor_(select)(output_row, output, 0, h);
+ THTensor_(cadd)(output_row, bias, 1.0, output_row);
+ }
+ THTensor_(free)(output_row);
+ THLongTensor_free(csr);
+ THTensor_(free)(weight);
+}
+
+void THNN_(SparseLinear_legacyUpdateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias)
+{
+ long h, i;
+ long outDim = THTensor_(size)(weight, 0);
+ long inDim = THTensor_(size)(weight, 1);
+
+ THArgCheck(THNN_(checkLegacyInput)(input), 2, "input size must be batchsize x nnz x 2");
+ THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous");
+ THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong");
+
+ weight = THTensor_(newContiguous)(weight);
+
+ long batchSize = THTensor_(size)(input, 0);
+ long nnz = THTensor_(size)(input, 1);
+ THTensor_(resize2d)(output, batchSize, outDim);
+
+ // output = weight * input + bias
+ THTensor_(zero)(output);
+#pragma omp parallel for private(h, i) schedule(static) if ( \
+ batchSize > 1 && batchSize * nnz * outDim > 10000)
+ for (h = 0; h < batchSize; h++) {
+ for (i = 0; i < nnz; i++) {
+ real val = THNN_(get3d)(input, h, i, 1);
+ if (val == 0) {
+ continue;
+ }
+
+ long offset = (long)(THNN_(get3d)(input, h, i, 0)) - 1;
+ if (offset >= 0 && offset < inDim) {
+ THBlas_(axpy)(outDim,
+ val,
+ COL_PTR2(weight, offset), weight->stride[0],
+ ROW_PTR2(output, h), output->stride[1]);
+ } else {
+        THError("index out of bounds. updateOutput: %d not between 1 and %d",
+ offset + 1, inDim);
+ }
+ }
+ }
+
+ THTensor* output_row = THTensor_(new)();
+ for (h = 0; h < batchSize; h++) {
+ THTensor_(select)(output_row, output, 0, h);
+ THTensor_(cadd)(output_row, bias, 1.0, output_row);
+ }
+ THTensor_(free)(output_row);
+ THTensor_(free)(weight);
+}
+
+void THNN_(SparseLinear_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *weight,
+ THTensor *bias,
+ accreal weightDecay_,
+ accreal scale_)
+{
+ real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ long h, i, col, hp0, hp1;
+ long outDim = THTensor_(size)(weight, 0);
+ long inDim = THTensor_(size)(weight, 1);
+
+ THArgCheck(THNN_(checkInput)(input), 2,
+ "input must be in coo format, nnz x 3");
+ THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
+ "gradWeight size wrong");
+ THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5,
+ "gradBias size wrong");
+ THArgCheck(THTensor_(isContiguous)(gradOutput), 1,
+ "gradOutput must be contiguous");
+
+ long nnz = THTensor_(size)(input, 0);
+
+ THLongTensor* csc = THLongTensor_newWithSize1d(inDim+1);
+ THLongTensor_zero(csc);
+ weight = THTensor_(newContiguous)(weight);
+
+#pragma omp parallel for private(i, h, hp0, hp1) schedule(static) if (nnz > 10000)
+ for (i = 0; i < nnz; i++) {
+ hp0 = (long)(THNN_(get2d)(input, i, 1)) - 1;
+ hp1 = (i+1 == nnz) ?
+ inDim :
+ (long)(THNN_(get2d)(input, i+1, 1)) - 1;
+ if (hp0 != hp1) for (h = hp0; h < hp1; h++) {
+ THLongTensor_set1d(csc, h+1, i+1);
+ }
+ }
+
+ // gradWeight += gradOutput * input
+#pragma omp parallel for private(h, i, col) schedule(static) if (nnz > 10000)
+ for (col = 0; col < inDim; col++) {
+ long i_start = THLongTensor_get1d(csc, col);
+ long i_end = THLongTensor_get1d(csc, col+1);
+ for (i = i_start; i < i_end; i++) {
+ real val = scale * THNN_(get2d)(input, i, 2);
+
+ h = (long)(THNN_(get2d)(input, i, 0)) - 1;
+ long offset = (long)(THNN_(get2d)(input, i, 1)) - 1;
+ if (offset >= 0 && offset < inDim) {
+ THBlas_(axpy)(outDim,
+ val,
+ ROW_PTR2(gradOutput, h), gradOutput->stride[1],
+ COL_PTR2(gradWeight, offset), gradWeight->stride[0]);
+ } else {
+ THError(
+          "index out of bounds. accGradParameters: %d not between 1 and %d",
+ offset + 1,
+ inDim);
+ }
+ }
+ }
+
+ // gradBias += gradOutput
+ THTensor* buf = THTensor_(new)();
+ THTensor_(sum)(buf, gradOutput, 0, 1);
+ THTensor_(cadd)(gradBias, gradBias, scale, buf);
+ THTensor_(free)(buf);
+ THLongTensor_free(csc);
+
+ if (weightDecay != 0) {
+ THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight);
+ }
+ THTensor_(free)(weight);
+}
+
+void THNN_(SparseLinear_legacyAccGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *weight,
+ THTensor *bias,
+ accreal weightDecay_,
+ accreal scale_)
+{
+ real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ long h, i;
+ long outDim = THTensor_(size)(weight, 0);
+ long inDim = THTensor_(size)(weight, 1);
+
+ THArgCheck(THNN_(checkLegacyInput)(input), 2,
+ "input size must be batchsize x nnz x 2");
+ THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
+ "gradWeight size wrong");
+ THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5,
+ "gradBias size wrong");
+ THArgCheck(THTensor_(isContiguous)(gradOutput), 1,
+ "gradOutput must be contiguous");
+
+ long batchSize = THTensor_(size)(input, 0);
+ long nnz = THTensor_(size)(input, 1);
+ THTensor_(resize2d)(gradOutput, batchSize, outDim);
+
+ // gradWeight += gradOutput * input
+#pragma omp parallel for private(h, i) schedule(static) if (\
+ batchSize * nnz * outDim > 10000)
+ for (i = 0; i < nnz; i++) {
+ for (h = 0; h < batchSize; h++) {
+ real val = scale * THNN_(get3d)(input, h, i, 1);
+ if (val == 0) {
+ continue;
+ }
+
+ long offset = (long)(THNN_(get3d)(input, h, i, 0)) - 1;
+ if (offset >= 0 && offset < inDim) {
+ THBlas_(axpy)(outDim,
+ val,
+ ROW_PTR2(gradOutput, h), gradOutput->stride[1],
+ COL_PTR2(gradWeight, offset), gradWeight->stride[0]);
+ } else {
+ THError(
+          "index out of bounds. accGradParameters: %d not between 1 and %d",
+ offset + 1,
+ inDim);
+ }
+ }
+ }
+
+ // gradBias += gradOutput
+ THTensor* gradOutput_row = THTensor_(new)();
+ for (h = 0; h < batchSize; h++) {
+ THTensor_(select)(gradOutput_row, gradOutput, 0, h);
+ THTensor_(cadd)(gradBias, gradBias, scale, gradOutput_row);
+ }
+ THTensor_(free)(gradOutput_row);
+
+ if (weightDecay != 0) {
+ THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight);
+ }
+}
+
+void THNN_(SparseLinear_updateParameters)(
+ THNNState *state,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *lastInput,
+ accreal learningRate_)
+{
+ real learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_);
+ long h, i;
+ long outDim = weight->size[0];
+ long inDim = weight->size[1];
+
+ THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
+ "gradWeight size wrong");
+ THArgCheck(THNN_(checkSize1D)(bias, outDim), 3, "bias size wrong");
+ THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong");
+ THArgCheck(THNN_(checkInput)(lastInput), 6,
+ "input must be in coo format, nnz x 3");
+
+
+ long nnz = THTensor_(size)(lastInput, 0);
+
+ // collect unique offsets of non-0 val in input
+ THTensor* offsets = THTensor_(newWithSize1d)(nnz);
+ long cnt = 0;
+ for (i = 0; i < nnz; i++) {
+ real val = THNN_(get2d)(lastInput, i, 2);
+ if (val == 0) {
+ continue;
+ }
+ long offset = (long)(THNN_(get2d)(lastInput, i, 1)) - 1;
+ if (offset >= 0 && offset < inDim) {
+ THNN_(set1d)(offsets, cnt++, offset);
+ } else {
+ THError(
+        "index out of bounds. updateParameters: %d not between 1 and %d",
+ offset + 1,
+ inDim);
+ }
+ }
+ if (cnt == 0) return;
+ THTensor_(resize1d)(offsets, cnt);
+
+ THTensor* uniqueOffsets = THTensor_(new)();
+ THLongTensor* ri = THLongTensor_new();
+ THTensor_(sort)(uniqueOffsets, ri, offsets, 0, 0);
+ THLongTensor_free(ri);
+ THTensor_(free)(offsets);
+
+ cnt = 1;
+ real* uniqueOffsets_p = THTensor_(data)(uniqueOffsets);
+ for (i = 1; i < THTensor_(size)(uniqueOffsets, 0); i++) {
+ if (uniqueOffsets_p[i] != uniqueOffsets_p[i - 1]) {
+ uniqueOffsets_p[cnt++] = uniqueOffsets_p[i];
+ }
+ }
+ THTensor_(resize1d)(uniqueOffsets, cnt);
+
+ // weight += -learningRate * gradWeight
+ THTensor_(cadd)(bias, bias, -learningRate, gradBias);
+#pragma omp parallel for private(i) schedule(static) if (cnt * outDim > 10000)
+ for (i = 0; i < cnt; i++) {
+ long offset = (long)uniqueOffsets_p[i];
+ THBlas_(axpy)(outDim,
+ -learningRate,
+ COL_PTR2(gradWeight, offset), gradWeight->stride[0],
+ COL_PTR2(weight, offset), weight->stride[0]);
+ }
+
+ THTensor_(free)(uniqueOffsets);
+}
+
+void THNN_(SparseLinear_legacyUpdateParameters)(
+ THNNState *state,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *lastInput,
+ accreal learningRate_)
+{
+ real learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_);
+ long h, i;
+ long outDim = weight->size[0];
+ long inDim = weight->size[1];
+
+ THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
+ "gradWeight size wrong");
+ THArgCheck(THNN_(checkSize1D)(bias, outDim), 3, "bias size wrong");
+ THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong");
+ THArgCheck(THNN_(checkLegacyInput)(lastInput), 6,
+ "input size must be batchsize x nnz x 2");
+
+
+ long batchSize = THTensor_(size)(lastInput, 0);
+ long nnz = THTensor_(size)(lastInput, 1);
+
+ // collect unique offsets of non-0 val in input
+ THTensor* offsets = THTensor_(newWithSize1d)(batchSize * nnz);
+ long cnt = 0;
+ for (h = 0; h < batchSize; h++) {
+ for (i = 0; i < nnz; i++) {
+ real val = THNN_(get3d)(lastInput, h, i, 1);
+ if (val == 0 ) {
+ continue;
+ }
+ long offset = (long)(THNN_(get3d)(lastInput, h, i, 0)) - 1;
+ if (offset >= 0 && offset < inDim) {
+ THNN_(set1d)(offsets, cnt++, offset);
+ } else {
+ THError(
+          "index out of bounds. updateParameters: %d not between 1 and %d",
+ offset + 1,
+ inDim);
+ }
+ }
+ }
+ THTensor_(resize1d)(offsets, cnt);
+
+ THTensor* uniqueOffsets = THTensor_(new)();
+ THLongTensor* ri = THLongTensor_new();
+ THTensor_(sort)(uniqueOffsets, ri, offsets, 0, 0);
+ THLongTensor_free(ri);
+ THTensor_(free)(offsets);
+
+ cnt = 1;
+ real* uniqueOffsets_p = THTensor_(data)(uniqueOffsets);
+ for (i = 1; i < THTensor_(size)(uniqueOffsets, 0); i++) {
+ if (uniqueOffsets_p[i] != uniqueOffsets_p[i - 1]) {
+ uniqueOffsets_p[cnt++] = uniqueOffsets_p[i];
+ }
+ }
+ THTensor_(resize1d)(uniqueOffsets, cnt);
+
+ // weight += -learningRate * gradWeight
+ THTensor_(cadd)(bias, bias, -learningRate, gradBias);
+#pragma omp parallel for private(i) schedule(static) if (cnt * outDim > 10000)
+ for (i = 0; i < cnt; i++) {
+ long offset = (long)uniqueOffsets_p[i];
+ THBlas_(axpy)(outDim,
+ -learningRate,
+ COL_PTR2(gradWeight, offset), gradWeight->stride[0],
+ COL_PTR2(weight, offset), weight->stride[0]);
+ }
+
+ THTensor_(free)(uniqueOffsets);
+}
+
+void THNN_(SparseLinear_zeroGradParameters)(
+ THNNState *state,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *lastInput)
+{
+ long h, i, j;
+
+ long outDim = gradWeight->size[0];
+ long inDim = gradWeight->size[1];
+
+ THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 3, "gradBias size wrong");
+ THArgCheck(THNN_(checkInput)(lastInput), 4,
+ "input must be in coo format, nnz x 3");
+
+ THTensor_(zero)(gradBias);
+
+ long nnz = THTensor_(size)(lastInput, 0);
+
+#pragma omp parallel for private(i, j) schedule(static) if ( \
+ nnz * outDim > 10000)
+ for (i = 0; i < nnz; i++) {
+ if (THNN_(get2d)(lastInput, i, 2) == 0 ) {
+ continue;
+ }
+
+ long offset = (long)(THNN_(get2d)(lastInput, i, 1)) - 1;
+ if (offset >= 0 && offset < inDim) {
+ real* pGradWeight = COL_PTR2(gradWeight, offset);
+ if (gradWeight->stride[0] == 1) {
+ THVector_(fill)(pGradWeight, 0, outDim);
+ } else {
+ long stride = gradWeight->stride[0];
+ for (j = 0; j < outDim; ++j) {
+ pGradWeight[j * stride] = 0;
+ }
+ }
+ } else {
+ THError(
+        "index out of bounds. zeroGradParameters: %d not between 1 and %d",
+ offset + 1,
+ inDim);
+ }
+ }
+}
+
+void THNN_(SparseLinear_legacyZeroGradParameters)(
+ THNNState *state,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *lastInput)
+{
+ long h, i, j;
+
+ long outDim = gradWeight->size[0];
+ long inDim = gradWeight->size[1];
+
+ THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 3, "gradBias size wrong");
+ THArgCheck(THNN_(checkLegacyInput)(lastInput), 4,
+ "input size must be batchsize x nnz x 2");
+
+ THTensor_(zero)(gradBias);
+
+ long batchSize = THTensor_(size)(lastInput, 0);
+ long nnz = THTensor_(size)(lastInput, 1);
+
+#pragma omp parallel for private(h, i, j) schedule(static) if ( \
+ batchSize > 1 && batchSize * nnz * outDim > 10000)
+ for (h = 0; h < batchSize; h++) {
+ for (i = 0; i < nnz; i++) {
+ if (THNN_(get3d)(lastInput, h, i, 1) == 0 ) {
+ continue;
+ }
+
+ long offset = (long)(THNN_(get3d)(lastInput, h, i, 0)) - 1;
+ if (offset >= 0 && offset < inDim) {
+ real* pGradWeight = COL_PTR2(gradWeight, offset);
+ if (gradWeight->stride[0] == 1) {
+ THVector_(fill)(pGradWeight, 0, outDim);
+ } else {
+ long stride = gradWeight->stride[0];
+ for (j = 0; j < outDim; ++j) {
+ pGradWeight[j * stride] = 0;
+ }
+ }
+ } else {
+ THError(
+          "index out of bounds. zeroGradParameters: %d not between 1 and %d",
+ offset + 1,
+ inDim);
+ }
+ }
+ }
+}
+
+#undef ROW_PTR2
+#undef COL_PTR2
+
+#endif
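
The non-legacy entry points above take input in COO form, an nnz x 3 tensor of (sample, feature, value) rows with 1-based indices ordered by sample; the csr tensor built at the top of updateOutput converts that into per-sample index ranges so each sample can be processed independently. A minimal sketch of that row-pointer construction (not part of the patch; 0-based ids, hypothetical names):

#include <stdio.h>

/* Build CSR-style row pointers from COO rows sorted by sample id,
 * mirroring the csr construction above (0-based for brevity).
 * row_ptr[h]..row_ptr[h+1] then delimits the entries of sample h. */
static void build_row_ptr(const long *sample_ids, long nnz,
                          long batch, long *row_ptr)
{
    for (long h = 0; h <= batch; h++)
        row_ptr[h] = 0;
    for (long i = 0; i < nnz; i++) {
        long hp0 = sample_ids[i];
        long hp1 = (i + 1 == nnz) ? batch : sample_ids[i + 1];
        for (long h = hp0; h < hp1; h++)
            row_ptr[h + 1] = i + 1;
    }
}

int main(void)
{
    long ids[5] = {0, 0, 1, 2, 2};   /* sample id of each nonzero */
    long row_ptr[4];
    build_row_ptr(ids, 5, 3, row_ptr);
    for (int h = 0; h < 3; h++)
        printf("sample %d: entries [%ld, %ld)\n", h, row_ptr[h], row_ptr[h + 1]);
    return 0;
}
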
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialAdaptiveAveragePooling.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialAdaptiveAveragePooling.c
new file mode 100644
index 000000000..3675b42d7
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialAdaptiveAveragePooling.c
@@ -0,0 +1,258 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialAdaptiveAveragePooling.c"
+#else
+
+#define START_IND(a,b,c) (int)floor((float)(a * c) / b)
+#define END_IND(a,b,c) (int)ceil((float)((a + 1) * c) / b)
+// #define START_IND(a,b,c) a * c / b
+// #define END_IND(a,b,c) (a + 1) * c / b + ((a + 1) * c % b > 0)?1:0
+
+static void THNN_(SpatialAdaptiveAveragePooling_updateOutput_frame)(
+ real *input_p,
+ real *output_p,
+ long nslices,
+ long iwidth,
+ long iheight,
+ long owidth,
+ long oheight,
+ long stridew,
+ long strideh,
+ long strided)
+{
+ long k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nslices; k++)
+ {
+ /* loop over output */
+ long i, j;
+ for(i = 0; i < oheight; i++)
+ {
+ int y_start = START_IND(i, oheight, iheight);
+ int y_end = END_IND(i, oheight, iheight);
+ int kH = y_end-y_start;
+
+ for(j = 0; j < owidth; j++)
+ {
+
+ int x_start = START_IND(j, owidth, iwidth);
+ int x_end = END_IND(j, owidth, iwidth);
+ int kW = x_end-x_start;
+
+ /* local pointers */
+ real *ip = input_p + k*strided + y_start*strideh + x_start*stridew;
+ real *op = output_p + k*owidth*oheight + i*owidth + j;
+
+ /* compute local average: */
+ real sum = 0;
+ int x,y;
+ for(y = 0; y < kH; y++)
+ {
+ for(x = 0; x < kW; x++)
+ {
+ real val = *(ip + y*strideh + x*stridew);
+ sum += val;
+ }
+ }
+
+ /* set output to local average */
+ *op = sum / kW / kH;
+ }
+ }
+ }
+}
+
+void THNN_(SpatialAdaptiveAveragePooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int owidth,
+ int oheight)
+{
+ int dimw = 2;
+ int dimh = 1;
+ long nbatch = 1;
+ long nslices;
+ long iheight;
+ long iwidth;
+
+ long istride_d;
+ long istride_h;
+ long istride_w;
+ long istride_b;
+
+ real *input_data;
+ real *output_data;
+
+
+ THNN_ARGCHECK(input->nDimension == 3 || input->nDimension == 4, 2, input,
+ "3D or 4D (batch mode) tensor expected for input, but got: %s");
+
+ if (input->nDimension == 4)
+ {
+ istride_b = input->stride[0];
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimh-1];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ /* strides */
+ istride_d = input->stride[dimh-1];
+ istride_h = input->stride[dimh];
+ istride_w = input->stride[dimw];
+
+ /* resize output */
+ if (input->nDimension == 3)
+ {
+ THTensor_(resize3d)(output, nslices, oheight, owidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+
+ THNN_(SpatialAdaptiveAveragePooling_updateOutput_frame)(input_data, output_data,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight,
+ istride_w,istride_h,
+ istride_d);
+ }
+ else
+ {
+ long p;
+
+ THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+
+#pragma omp parallel for private(p)
+ for (p = 0; p < nbatch; p++)
+ {
+ THNN_(SpatialAdaptiveAveragePooling_updateOutput_frame)(input_data+p*istride_b, output_data+p*nslices*owidth*oheight,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight,
+ istride_w,istride_h,
+ istride_d);
+ }
+ }
+}
+
+static void THNN_(SpatialAdaptiveAveragePooling_updateGradInput_frame)(
+ real *gradInput_p,
+ real *gradOutput_p,
+ long nslices,
+ long iwidth,
+ long iheight,
+ long owidth,
+ long oheight)
+{
+ long k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nslices; k++)
+ {
+ real *gradInput_p_k = gradInput_p + k*iwidth*iheight;
+ real *gradOutput_p_k = gradOutput_p + k*owidth*oheight;
+
+ /* calculate average */
+ long i, j;
+ for(i = 0; i < oheight; i++)
+ {
+ int y_start = START_IND(i, oheight, iheight);
+ int y_end = END_IND(i, oheight, iheight);
+ int kH = y_end-y_start;
+
+ for(j = 0; j < owidth; j++)
+ {
+
+ int x_start = START_IND(j, owidth, iwidth);
+ int x_end = END_IND(j, owidth, iwidth);
+ int kW = x_end-x_start;
+
+ int x,y;
+ for(y = y_start; y < y_end; y++)
+ {
+ for(x = x_start; x < x_end; x++)
+ {
+ /* update gradient */
+ gradInput_p_k[y*iwidth + x] += gradOutput_p_k[i*owidth + j] / kW / kH;
+ }
+ }
+ }
+ }
+ }
+}
+
+void THNN_(SpatialAdaptiveAveragePooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput)
+{
+ int dimw = 2;
+ int dimh = 1;
+ long nbatch = 1;
+ int nslices;
+ int iheight;
+ int iwidth;
+ int oheight;
+ int owidth;
+ real *gradInput_data;
+ real *gradOutput_data;
+
+ /* get contiguous gradOutput */
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ /* resize */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ if (input->nDimension == 4) {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimh-1];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ oheight = gradOutput->size[dimh];
+ owidth = gradOutput->size[dimw];
+
+ /* get raw pointers */
+ gradInput_data = THTensor_(data)(gradInput);
+ gradOutput_data = THTensor_(data)(gradOutput);
+
+ /* backprop */
+ if (input->nDimension == 3)
+ {
+ THNN_(SpatialAdaptiveAveragePooling_updateGradInput_frame)(gradInput_data, gradOutput_data,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight);
+ }
+ else
+ {
+ long p;
+#pragma omp parallel for private(p)
+ for (p = 0; p < nbatch; p++)
+ {
+ THNN_(SpatialAdaptiveAveragePooling_updateGradInput_frame)(gradInput_data+p*nslices*iwidth*iheight, gradOutput_data+p*nslices*owidth*oheight,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(gradOutput);
+}
+
+#endif
+
+#undef START_IND
+#undef END_IND
\ No newline at end of file
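
START_IND/END_IND above partition an input extent c into b output cells, cell a covering [floor(a*c/b), ceil((a+1)*c/b)); when c is not a multiple of b, neighbouring cells overlap by design. A small demonstration (not part of the patch):

#include <stdio.h>
#include <math.h>

/* Bin boundaries used by adaptive pooling: output cell a of b cells
 * over an input extent c covers [start, end). */
static int start_ind(int a, int b, int c) { return (int)floor((double)(a * c) / b); }
static int end_ind(int a, int b, int c)   { return (int)ceil((double)((a + 1) * c) / b); }

int main(void)
{
    int b = 3, c = 7;   /* pool 7 input columns into 3 output columns */
    for (int a = 0; a < b; a++)
        printf("cell %d: [%d, %d)\n", a, start_ind(a, b, c), end_ind(a, b, c));
    /* prints [0, 3), [2, 5), [4, 7): cells overlap where 7 % 3 != 0 */
    return 0;
}
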
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialAdaptiveMaxPooling.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialAdaptiveMaxPooling.c
new file mode 100644
index 000000000..fff716e67
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialAdaptiveMaxPooling.c
@@ -0,0 +1,274 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialAdaptiveMaxPooling.c"
+#else
+
+static void THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)(
+ real *input_p,
+ real *output_p,
+ THIndex_t *indx_p,
+ THIndex_t *indy_p,
+ long nslices,
+ long iwidth,
+ long iheight,
+ long owidth,
+ long oheight,
+ long stridew,
+ long strideh,
+ long strided)
+{
+ long k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nslices; k++)
+ {
+ /* loop over output */
+ long i, j;
+ for(i = 0; i < oheight; i++)
+ {
+ int y_start = (int)floor((float)i / oheight * iheight);
+ int y_end = (int)ceil((float)(i + 1) / oheight * iheight);
+ int kH = y_end-y_start;
+
+ for(j = 0; j < owidth; j++)
+ {
+
+ int x_start = (int)floor((float)j / owidth * iwidth);
+ int x_end = (int)ceil((float)(j + 1) / owidth * iwidth);
+ int kW = x_end-x_start;
+
+ /* local pointers */
+ real *ip = input_p + k*strided + y_start*strideh + x_start*stridew;
+ real *op = output_p + k*owidth*oheight + i*owidth + j;
+ THIndex_t *indyp = indy_p + k*owidth*oheight + i*owidth + j;
+ THIndex_t *indxp = indx_p + k*owidth*oheight + i*owidth + j;
+
+ /* compute local max: */
+ long maxindex = -1;
+ real maxval = -FLT_MAX;
+ long tcntr = 0;
+ int x,y;
+ for(y = 0; y < kH; y++)
+ {
+ for(x = 0; x < kW; x++)
+ {
+ real val = *(ip + y*strideh + x*stridew);
+ if (val > maxval)
+ {
+ maxval = val;
+ maxindex = tcntr;
+ }
+ tcntr++;
+ }
+ }
+
+ /* set output to local max */
+ *op = maxval;
+
+ /* store location of max (x,y) */
+ *indyp = (maxindex / kW) + TH_INDEX_BASE;
+ *indxp = (maxindex % kW) + TH_INDEX_BASE;
+ }
+ }
+ }
+}
+
+void THNN_(SpatialAdaptiveMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int owidth,
+ int oheight)
+{
+ int dimw = 2;
+ int dimh = 1;
+ long nbatch = 1;
+ long nslices;
+ long iheight;
+ long iwidth;
+
+ long istride_d;
+ long istride_h;
+ long istride_w;
+ long istride_b;
+
+ real *input_data;
+ real *output_data;
+ THIndex_t *indices_data;
+
+
+ THNN_ARGCHECK(input->nDimension == 3 || input->nDimension == 4, 2, input,
+ "3D or 4D (batch mode) tensor expected for input, but got: %s");
+
+ if (input->nDimension == 4)
+ {
+ istride_b = input->stride[0];
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimh-1];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ /* strides */
+ istride_d = input->stride[dimh-1];
+ istride_h = input->stride[dimh];
+ istride_w = input->stride[dimw];
+
+ /* resize output */
+ if (input->nDimension == 3)
+ {
+ THTensor_(resize3d)(output, nslices, oheight, owidth);
+ /* indices will contain i,j locations for each output point */
+ THIndexTensor_(resize4d)(indices, 2, nslices, oheight, owidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+ indices_data = THIndexTensor_(data)(indices);
+
+ THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)(input_data, output_data,
+ indices_data+nslices*owidth*oheight, indices_data,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight,
+ istride_w,istride_h,
+ istride_d);
+ }
+ else
+ {
+ long p;
+
+ THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth);
+ /* indices will contain i,j locations for each output point */
+ THIndexTensor_(resize5d)(indices, 2, nbatch, nslices, oheight, owidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+ indices_data = THIndexTensor_(data)(indices);
+
+#pragma omp parallel for private(p)
+ for (p = 0; p < nbatch; p++)
+ {
+ THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)(input_data+p*istride_b, output_data+p*nslices*owidth*oheight,
+ indices_data+(p+nbatch)*nslices*owidth*oheight, indices_data+p*nslices*owidth*oheight,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight,
+ istride_w,istride_h,
+ istride_d);
+ }
+ }
+}
+
+static void THNN_(SpatialAdaptiveMaxPooling_updateGradInput_frame)(
+ real *gradInput_p,
+ real *gradOutput_p,
+ THIndex_t *indx_p,
+ THIndex_t *indy_p,
+ long nslices,
+ long iwidth,
+ long iheight,
+ long owidth,
+ long oheight)
+{
+ long k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nslices; k++)
+ {
+ real *gradInput_p_k = gradInput_p + k*iwidth*iheight;
+ real *gradOutput_p_k = gradOutput_p + k*owidth*oheight;
+ THIndex_t *indx_p_k = indx_p + k*owidth*oheight;
+ THIndex_t *indy_p_k = indy_p + k*owidth*oheight;
+
+ /* calculate max points */
+ long i, j;
+ for(i = 0; i < oheight; i++)
+ {
+ int y_start = (int)floor((float) i / oheight * iheight);
+ for(j = 0; j < owidth; j++)
+ {
+ int x_start = (int)floor((float) j / owidth * iwidth);
+ /* retrieve position of max */
+ long maxi = indy_p_k[i*owidth + j] - TH_INDEX_BASE + y_start;
+ long maxj = indx_p_k[i*owidth + j] - TH_INDEX_BASE + x_start;
+
+ /* update gradient */
+ gradInput_p_k[maxi*iwidth + maxj] += gradOutput_p_k[i*owidth + j];
+ }
+ }
+ }
+}
+
+void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices)
+{
+ int dimw = 2;
+ int dimh = 1;
+ long nbatch = 1;
+ int nslices;
+ int iheight;
+ int iwidth;
+ int oheight;
+ int owidth;
+ real *gradInput_data;
+ real *gradOutput_data;
+ THIndex_t *indices_data;
+
+ /* get contiguous gradOutput */
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ /* resize */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ if (input->nDimension == 4) {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimh-1];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ oheight = gradOutput->size[dimh];
+ owidth = gradOutput->size[dimw];
+
+ /* get raw pointers */
+ gradInput_data = THTensor_(data)(gradInput);
+ gradOutput_data = THTensor_(data)(gradOutput);
+ indices_data = THIndexTensor_(data)(indices);
+
+ /* backprop */
+ if (input->nDimension == 3)
+ {
+ THNN_(SpatialAdaptiveMaxPooling_updateGradInput_frame)(gradInput_data, gradOutput_data,
+ indices_data+nslices*owidth*oheight, indices_data,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight);
+ }
+ else
+ {
+ long p;
+#pragma omp parallel for private(p)
+ for (p = 0; p < nbatch; p++)
+ {
+ THNN_(SpatialAdaptiveMaxPooling_updateGradInput_frame)(gradInput_data+p*nslices*iwidth*iheight, gradOutput_data+p*nslices*owidth*oheight,
+ indices_data+(p+nbatch)*nslices*owidth*oheight, indices_data+p*nslices*owidth*oheight,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(gradOutput);
+}
+
+#endif
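
The indices tensor above carries a leading dimension of size 2: the first plane stores the row (y) of each maximum and the second plane the column (x), both offset by TH_INDEX_BASE, and updateGradInput uses them to route each output gradient back to the single winning input position. A toy sketch of that routing (not part of the patch; 0-based coordinates):

#include <stdio.h>

/* Gradient routing in max-pool backward: each output cell adds its
 * gradient to the one input position recorded during the forward pass. */
int main(void)
{
    double grad_in[2][3] = {{0}};
    double grad_out[2] = {1.0, 2.0};
    int max_y[2] = {0, 1};   /* row of the max for each output cell */
    int max_x[2] = {2, 0};   /* column of the max */
    for (int j = 0; j < 2; j++)
        grad_in[max_y[j]][max_x[j]] += grad_out[j];
    for (int y = 0; y < 2; y++)
        for (int x = 0; x < 3; x++)
            printf("%f%c", grad_in[y][x], x == 2 ? '\n' : ' ');
    return 0;
}
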
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialAveragePooling.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialAveragePooling.c
new file mode 100644
index 000000000..c063502e7
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialAveragePooling.c
@@ -0,0 +1,329 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialAveragePooling.c"
+#else
+
+static inline void THNN_(SpatialAveragePooling_shapeCheck)(
+ THTensor *input, THTensor *gradOutput,
+ int kH, int kW, int dH, int dW, int padH, int padW,
+ bool ceil_mode) {
+
+ THArgCheck(kW > 0 && kH > 0, 5,
+ "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
+ THArgCheck(dW > 0 && dH > 0, 8,
+ "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
+
+ int ndim = input->nDimension;
+ int dimf = 0;
+ int dimh = 1;
+ int dimw = 2;
+
+ if (ndim == 4) {
+ dimf++;
+ dimh++;
+ dimw++;
+ }
+
+ THNN_ARGCHECK(ndim == 3 || ndim == 4, 2, input,
+ "3D or 4D input tensor expected but got: %s");
+
+ THArgCheck(kW/2 >= padW && kH/2 >= padH, 2,
+ "pad should be smaller than half of kernel size, but got "
+ "padW = %d, padH = %d, kW = %d, kH = %d",
+ padW, padH, kW, kH);
+
+ long nInputPlane = input->size[dimh-1];
+ long inputHeight = input->size[dimh];
+ long inputWidth = input->size[dimw];
+ long outputHeight, outputWidth;
+ long nOutputPlane = nInputPlane;
+
+ if(ceil_mode)
+ {
+ outputHeight = (long)(ceil((float)(inputHeight - kH + 2*padH) / dH)) + 1;
+ outputWidth = (long)(ceil((float)(inputWidth - kW + 2*padW) / dW)) + 1;
+ }
+ else
+ {
+ outputHeight = (long)(floor((float)(inputHeight - kH + 2*padH) / dH)) + 1;
+ outputWidth = (long)(floor((float)(inputWidth - kW + 2*padW) / dW)) + 1;
+ }
+
+ if (padW || padH)
+ {
+ // ensure that the last pooling starts inside the image
+ // needed to avoid problems in ceil mode
+ if ((outputHeight - 1)*dH >= inputHeight + padH)
+ --outputHeight;
+ if ((outputWidth - 1)*dW >= inputWidth + padW)
+ --outputWidth;
+ }
+
+ if (outputWidth < 1 || outputHeight < 1)
+ THError("Given input size: (%dx%dx%d). "
+ "Calculated output size: (%dx%dx%d). Output size is too small",
+ nInputPlane,inputHeight,inputWidth,nInputPlane,outputHeight,outputWidth);
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
+ }
+}
+
+void THNN_(SpatialAveragePooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH,
+ bool ceil_mode,
+ bool count_include_pad)
+{
+ real *output_data;
+ real *input_data;
+
+ int dimw = 2;
+ int dimh = 1;
+ int dimc = 0;
+ long nbatch = 1;
+
+ long inputWidth;
+ long inputHeight;
+ long outputWidth;
+ long outputHeight;
+ long nInputPlane; // number of channels (or colors)
+
+ long k;
+
+ THNN_(SpatialAveragePooling_shapeCheck)
+ (input, NULL, kH, kW, dH, dW, padH, padW, ceil_mode);
+
+ if (input->nDimension == 4) {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ dimc++;
+ }
+
+ inputWidth = input->size[dimw];
+ inputHeight = input->size[dimh];
+ nInputPlane = input->size[dimc];
+
+ if(ceil_mode)
+ {
+ outputWidth = (long)(ceil((float)(inputWidth - kW + 2*padW) / dW)) + 1;
+ outputHeight = (long)(ceil((float)(inputHeight - kH + 2*padH) / dH)) + 1;
+ }
+ else
+ {
+ outputWidth = (long)(floor((float)(inputWidth - kW + 2*padW) / dW)) + 1;
+ outputHeight = (long)(floor((float)(inputHeight - kH + 2*padH) / dH)) + 1;
+ }
+ if (padW || padH)
+ {
+ // ensure that the last pooling starts inside the image
+ // needed to avoid problems in ceil mode
+ if ((outputHeight - 1)*dH >= inputHeight + padH)
+ --outputHeight;
+ if ((outputWidth - 1)*dW >= inputWidth + padW)
+ --outputWidth;
+ }
+
+ if (input->nDimension == 3)
+ THTensor_(resize3d)(output, nInputPlane, outputHeight, outputWidth);
+ else
+ THTensor_(resize4d)(output, input->size[0], nInputPlane, outputHeight, outputWidth);
+
+ input = THTensor_(newContiguous)(input);
+ THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous");
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+
+#pragma omp parallel for private(k)
+ for(k = 0; k < nInputPlane; k++)
+ {
+ long p;
+ for(p = 0; p < nbatch; p++)
+ {
+ long xx, yy;
+ /* For all output pixels... */
+ real *ptr_output = output_data + p*nInputPlane*outputWidth*outputHeight + k*outputWidth*outputHeight;
+ real *ptr_input = input_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight;
+ long i;
+ for(i = 0; i < outputWidth*outputHeight; i++)
+ ptr_output[i] = 0;
+
+ for(yy = 0; yy < outputHeight; yy++)
+ {
+ for(xx = 0; xx < outputWidth; xx++)
+ {
+ /* Compute the mean of the input image... */
+ long hstart = yy * dH - padH;
+ long wstart = xx * dW - padW;
+ long hend = fminf(hstart + kH, inputHeight + padH);
+ long wend = fminf(wstart + kW, inputWidth + padW);
+ int pool_size = (hend - hstart) * (wend - wstart);
+ hstart = fmaxf(hstart, 0);
+ wstart = fmaxf(wstart, 0);
+ hend = fminf(hend, inputHeight);
+ wend = fminf(wend, inputWidth);
+
+ real sum = 0;
+
+ int divide_factor;
+ if(count_include_pad)
+ divide_factor = pool_size;
+ else
+ divide_factor = (hend - hstart) * (wend - wstart);
+
+ long kx, ky;
+
+ for(ky = hstart; ky < hend; ky++)
+ {
+ for(kx = wstart; kx < wend; kx++)
+ sum += ptr_input[ky*inputWidth + kx];
+ }
+ /* Update output */
+ *ptr_output++ += sum/divide_factor;
+ }
+ }
+ }
+ }
+ THTensor_(free)(input);
+}
+
+void THNN_(SpatialAveragePooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH,
+ bool ceil_mode,
+ bool count_include_pad)
+{
+ int dimw = 2;
+ int dimh = 1;
+ int dimc = 0;
+ long nbatch = 1;
+ long ndim = 3;
+
+ long inputWidth;
+ long inputHeight;
+ long outputWidth;
+ long outputHeight;
+ long nInputPlane; // number of channels (or colors)
+
+ real *gradOutput_data;
+ real *input_data, *gradInput_data;
+
+ long k;
+
+ THNN_(SpatialAveragePooling_shapeCheck)
+ (input, gradOutput, kH, kW, dH, dW, padH, padW, ceil_mode);
+
+
+ if (input->nDimension == 4) {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ dimc++;
+ ndim = 4;
+ }
+
+ inputWidth = input->size[dimw];
+ inputHeight = input->size[dimh];
+ nInputPlane = input->size[dimc];
+
+ if(ceil_mode)
+ {
+ outputWidth = (long)(ceil((float)(inputWidth - kW + 2*padW) / dW)) + 1;
+ outputHeight = (long)(ceil((float)(inputHeight - kH + 2*padH) / dH)) + 1;
+ }
+ else
+ {
+ outputWidth = (long)(floor((float)(inputWidth - kW + 2*padW) / dW)) + 1;
+ outputHeight = (long)(floor((float)(inputHeight - kH + 2*padH) / dH)) + 1;
+ }
+ if (padW || padH)
+ {
+ // ensure that the last pooling starts inside the image
+ // needed to avoid problems in ceil mode
+ if ((outputHeight - 1)*dH >= inputHeight + padH)
+ --outputHeight;
+ if ((outputWidth - 1)*dW >= inputWidth + padW)
+ --outputWidth;
+ }
+
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
+
+ THTensor_(resizeAs)(gradInput, input);
+
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ THArgCheck(THTensor_(isContiguous)(gradInput), 4, "gradInput must be contiguous");
+
+ gradInput_data = THTensor_(data)(gradInput);
+ gradOutput_data = THTensor_(data)(gradOutput);
+
+#pragma omp parallel for private(k)
+ for(k = 0; k < nInputPlane; k++)
+ {
+ long p;
+ for(p = 0; p < nbatch; p++)
+ {
+ real *ptr_gradOutput = gradOutput_data + p*nInputPlane*outputHeight*outputWidth + k*outputWidth*outputHeight;
+ long xx, yy;
+
+ real *ptr_gradInput = gradInput_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight;
+
+ long i;
+ for(i = 0; i < inputWidth*inputHeight; i++)
+ ptr_gradInput[i] = 0.0;
+
+ for(yy = 0; yy < outputHeight; yy++)
+ {
+ for(xx = 0; xx < outputWidth; xx++)
+ {
+ long hstart = yy * dH - padH;
+ long wstart = xx * dW - padW;
+ long hend = fminf(hstart + kH, inputHeight + padH);
+ long wend = fminf(wstart + kW, inputWidth + padW);
+ int pool_size = (hend - hstart) * (wend - wstart);
+ hstart = fmaxf(hstart, 0);
+ wstart = fmaxf(wstart, 0);
+ hend = fminf(hend, inputHeight);
+ wend = fminf(wend, inputWidth);
+
+ real z = *ptr_gradOutput++;
+
+ int divide_factor;
+ if(count_include_pad)
+ divide_factor = pool_size;
+ else
+ divide_factor = (hend - hstart) * (wend - wstart);
+
+ long kx, ky;
+ for(ky = hstart ; ky < hend; ky++)
+ {
+ for(kx = wstart; kx < wend; kx++)
+ ptr_gradInput[ky*inputWidth + kx] += z/divide_factor;
+ }
+ }
+ }
+ }
+ }
+
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialClassNLLCriterion.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialClassNLLCriterion.c
new file mode 100644
index 000000000..d711c8590
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialClassNLLCriterion.c
@@ -0,0 +1,131 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialClassNLLCriterion.c"
+#else
+
+#define INITIAL_CHECK \
+ THArgCheck(THIndexTensor_(nDimension)(target) == 3, 3, \
+ "only batches of spatial targets supported (3D tensors)" \
+ " but got targets of dimension: %d", \
+ THIndexTensor_(nDimension)(target)); \
+ THArgCheck(THTensor_(nDimension)(input) == 4, 2, \
+ "only batches of spatial inputs supported (4D tensors), " \
+ "but got input of dimension: %d", THTensor_(nDimension)(input)); \
+ if (weights && THTensor_(nElement)(weights) != THTensor_(size)(input, 1)) { \
+ THError("weight tensor should be defined either for all or no classes"); \
+ } \
+ \
+ { \
+ long input0 = THTensor_(size)(input, 0); \
+ long input1 = THTensor_(size)(input, 1); \
+ long input2 = THTensor_(size)(input, 2); \
+ long input3 = THTensor_(size)(input, 3); \
+ long target0 = THIndexTensor_(size)(target, 0); \
+ long target1 = THIndexTensor_(size)(target, 1); \
+ long target2 = THIndexTensor_(size)(target, 2); \
+ THAssertMsg(input0 == target0 && input2 == target1 && input3 == target2, \
+ "size mismatch (got input: %ldx%ldx%ldx%ld, target: %ldx%ldx%ld)", \
+ input0, input1, input2, input3, target0, target1, target2); \
+ }
+
+void THNN_(SpatialClassNLLCriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THIndexTensor *target,
+ THTensor *output,
+ bool sizeAverage,
+ THTensor *weights,
+ THTensor *total_weight)
+{
+ INITIAL_CHECK;
+
+ input = THTensor_(newContiguous)(input);
+ target = THIndexTensor_(newContiguous)(target);
+ weights = weights ? THTensor_(newContiguous)(weights) : NULL;
+
+ real *input_data = THTensor_(data)(input);
+ THIndex_t *target_data = THIndexTensor_(data)(target);
+ real *weights_data = weights ? THTensor_(data)(weights) : NULL;
+ real *output_data = THTensor_(data)(output);
+ real *total_weight_data = THTensor_(data)(total_weight);
+
+ long batch_size = THTensor_(size)(input, 0);
+ long n_classes = THTensor_(size)(input, 1);
+ long map_size = THTensor_(size)(input, 2) * THTensor_(size)(input, 3);
+ long sample_size = map_size * n_classes;
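+ // Layout note: input is contiguous NCHW, so the log-probability of class c at
+ // spatial position elem of sample b sits at input_data[b*sample_size + c*map_size + elem];
+ // the loop below accumulates output = -sum over (b, elem) of w[target] * input[b, target, elem],
+ // optionally normalized by the total weight when sizeAverage is set.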
+
+ real total_weight_acc = 0;
+ real output_acc = 0;
+ for (int b = 0; b < batch_size; b++) {
+ for (int elem = 0; elem < map_size; elem++) {
+ int cur_target = target_data[b * map_size + elem] - TH_INDEX_BASE;
+ THAssert(cur_target >= 0 && cur_target < n_classes);
+
+ real cur_weight = weights ? weights_data[cur_target] : 1.0f;
+ total_weight_acc += cur_weight;
+ output_acc -= input_data[b * sample_size + cur_target * map_size + elem] * cur_weight;
+ }
+ }
+ *total_weight_data = total_weight_acc;
+ *output_data = output_acc;
+
+ if (sizeAverage && *total_weight_data)
+ *output_data /= *total_weight_data;
+
+ THTensor_(free)(input);
+ THIndexTensor_(free)(target);
+ if (weights)
+ THTensor_(free)(weights);
+}
+
+void THNN_(SpatialClassNLLCriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THIndexTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage,
+ THTensor *weights,
+ THTensor *total_weight)
+{
+ INITIAL_CHECK;
+ THArgCheck(THTensor_(isContiguous)(gradInput), 4,
+ "gradInput must be contiguous");
+
+ real *total_weight_data = THTensor_(data)(total_weight);
+ if (*total_weight_data <= 0)
+ return;
+
+ target = THIndexTensor_(newContiguous)(target);
+ weights = weights ? THTensor_(newContiguous)(weights) : NULL;
+
+ THIndex_t *target_data = THIndexTensor_(data)(target);
+ real *weights_data = weights ? THTensor_(data)(weights) : NULL;
+ real *gradInput_data = THTensor_(data)(gradInput);
+
+ long batch_size = THTensor_(size)(input, 0);
+ long n_classes = THTensor_(size)(input, 1);
+ long map_size = THTensor_(size)(input, 2) * THTensor_(size)(input, 3);
+ long sample_size = map_size * n_classes;
+
+ real normalize = sizeAverage ? *total_weight_data : 1.0f;
+
+ int b;
+ #pragma omp parallel for
+ for (b = 0; b < batch_size; b++) {
+ int elem;
+ for (elem = 0; elem < map_size; elem++) {
+ int cur_target = target_data[b * map_size + elem] - TH_INDEX_BASE;
+ THAssert(cur_target >= 0 && cur_target < n_classes);
+
+ gradInput_data[b * sample_size + cur_target * map_size + elem] =
+ -(weights ? weights_data[cur_target] : 1.0f) / normalize;
+ }
+ }
+
+ THIndexTensor_(free)(target);
+ if (weights)
+ THTensor_(free)(weights);
+}
+
+#undef INITIAL_CHECK
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionLocal.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionLocal.c
new file mode 100644
index 000000000..6db5a5db9
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionLocal.c
@@ -0,0 +1,367 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialConvolutionLocal.c"
+#else
+
+static inline void THNN_(SpatialConvolutionLocal_shapeCheck)(
+ THTensor *input, THTensor *gradOutput,
+ THTensor *weight, THTensor *bias,
+ int kH, int kW, int dH,
+ int dW, int padH, int padW,
+ long inputHeight, long inputWidth,
+ long outputHeight, long outputWidth) {
+
+ THArgCheck(kW > 0 && kH > 0, 9,
+ "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
+ THArgCheck(dW > 0 && dH > 0, 11,
+ "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
+
+ int ndim = input->nDimension;
+ int dimf = 0;
+ int dimh = 1;
+ int dimw = 2;
+
+ if (ndim == 4) {
+ dimf++;
+ dimh++;
+ dimw++;
+ }
+
+ THNN_ARGCHECK(ndim == 3 || ndim == 4, 2, input,
+ "3D or 4D input tensor expected but got: %s");
+
+ long nInputPlane = weight->size[2] / (kH * kW);
+ long nOutputPlane = weight->size[1];
+
+ if (bias != NULL) {
+ THNN_CHECK_DIM_SIZE(bias, 3, 0, nOutputPlane);
+ THNN_CHECK_DIM_SIZE(bias, 3, 1, outputHeight);
+ THNN_CHECK_DIM_SIZE(bias, 3, 2, outputWidth);
+ }
+
+ THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane);
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
+ }
+}
+
+static THTensor* THNN_(view_weight_local)(THTensor *_weight)
+{
+ THTensor *weight = THTensor_(newContiguous)(_weight);
+ THArgCheck(weight->nDimension == 3 || weight->nDimension == 6, 4,
+ "weight tensor should be 3D or 6D - got %dD", weight->nDimension);
+ if (weight->nDimension == 6) {
+ long s1 = weight->size[0] * weight->size[1];
+ long s2 = weight->size[2];
+ long s3 = weight->size[3] * weight->size[4] * weight->size[5];
+ THTensor *old_weight = weight;
+ weight = THTensor_(newWithStorage3d)(weight->storage,
+ weight->storageOffset,
+ s1, -1, s2, -1, s3, -1);
+ THTensor_(free)(old_weight);
+ }
+ return weight;
+}
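+
+// Shape note: a 6D local weight, presumably (oH, oW, nOutputPlane, nInputPlane, kH, kW),
+// is viewed above as 3D (oH*oW, nOutputPlane, nInputPlane*kH*kW) so that the
+// per-location filters can be applied with a single batched GEMM (baddbmm) below.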
+
+static void THNN_(SpatialConvolutionLocal_updateOutput_frame)
+ (
+ THTensor *input, THTensor *output,
+ THTensor *weight, THTensor *bias, THTensor *finput,
+ int kW, int kH, int dW, int dH, int padW, int padH,
+ long nInputPlane, long inputWidth, long inputHeight,
+ long nOutputPlane, long outputWidth, long outputHeight)
+{
+ long i;
+ THTensor *output3d, *finput3d;
+
+ THNN_(unfolded_copy)(finput, input, kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ outputWidth, outputHeight);
+
+ THTensor_(copy)(output, bias);
+
+ output3d = THTensor_(newWithStorage3d)
+ (output->storage, output->storageOffset,
+ outputHeight * outputWidth, 1,
+ nOutputPlane, outputHeight * outputWidth,
+ 1, nOutputPlane * outputHeight * outputWidth);
+
+ finput3d = THTensor_(newWithStorage3d)
+ (finput->storage, finput->storageOffset,
+ outputHeight * outputWidth, 1,
+ kW * kH * nInputPlane, outputHeight * outputWidth,
+ 1, kW * kH * nInputPlane * outputHeight * outputWidth);
+
+ // weight: oH*oW x nOutputPlane x nInputPlane*kH*kW
+ // finput3d: oH*oW x nInputPlane*kH*kW x 1
+ THTensor_(baddbmm)(output3d, 1.0, output3d, 1.0, weight, finput3d);
+ // output3d: oH*oW x nOutputPlane x 1
+
+ THTensor_(free)(output3d);
+ THTensor_(free)(finput3d);
+}
+
+void THNN_(SpatialConvolutionLocal_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ long inputWidth, long inputHeight,
+ long outputWidth, long outputHeight)
+{
+ weight = THNN_(view_weight_local)(weight);
+
+ THNN_(SpatialConvolutionLocal_shapeCheck)
+ (input, NULL, weight, bias, kH, kW, dH, dW, padH, padW,
+ inputHeight, inputWidth, outputHeight, outputWidth);
+
+ input = THTensor_(newContiguous)(input);
+
+ long nInputPlane = THTensor_(size)(weight, 2) / (kW * kH);
+ long nOutputPlane = THTensor_(size)(weight, 1);
+
+ if(input->nDimension == 3)
+ {
+ THTensor_(resize2d)(finput, kW*kH*nInputPlane, outputHeight*outputWidth);
+ THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth);
+
+ THNN_(SpatialConvolutionLocal_updateOutput_frame)
+ (input, output, weight, bias, finput,
+ kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ nOutputPlane, outputWidth, outputHeight);
+ }
+ else
+ {
+ long T = input->size[0];
+ long t;
+
+ THTensor_(resize3d)(finput, T, kW*kH*nInputPlane, outputHeight*outputWidth);
+ THTensor_(resize4d)(output, T, nOutputPlane, outputHeight, outputWidth);
+
+#pragma omp parallel for private(t)
+ for(t = 0; t < T; t++)
+ {
+ THTensor *input_t = THTensor_(newSelect)(input, 0, t);
+ THTensor *output_t = THTensor_(newSelect)(output, 0, t);
+ THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
+
+ THNN_(SpatialConvolutionLocal_updateOutput_frame)
+ (input_t, output_t, weight, bias, finput_t,
+ kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ nOutputPlane, outputWidth, outputHeight);
+
+ THTensor_(free)(input_t);
+ THTensor_(free)(output_t);
+ THTensor_(free)(finput_t);
+ }
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(weight);
+}
+
+
+static void THNN_(SpatialConvolutionLocal_updateGradInput_frame)
+ (THTensor *gradInput, THTensor *gradOutput,
+ THTensor *weight, THTensor *fgradInput,
+ int kW, int kH, int dW, int dH, int padW, int padH,
+ long nInputPlane, long inputWidth, long inputHeight,
+ long nOutputPlane, long outputWidth, long outputHeight)
+{
+ THTensor *gradOutput3d, *fgradInput3d;
+ gradOutput3d = THTensor_(newWithStorage3d)(gradOutput->storage, gradOutput->storageOffset,
+ outputHeight*outputWidth, 1,
+ nOutputPlane, outputHeight*outputWidth,
+ 1, nOutputPlane*outputHeight*outputWidth);
+ fgradInput3d = THTensor_(newWithStorage3d)(fgradInput->storage, fgradInput->storageOffset,
+ outputHeight*outputWidth, 1,
+ kW*kH*nInputPlane, outputHeight*outputWidth,
+ 1, kW*kH*nInputPlane*outputHeight*outputWidth);
+ // weight: oH*oW x nInputPlane*kH*kW x nOutputPlane
+ // gradOutput3d: oH*oW x nOutputPlane x 1
+ THTensor_(baddbmm)(fgradInput3d, 0.0, fgradInput3d, 1.0, weight, gradOutput3d);
+ // fgradInput3d: oH*oW x nInputPlane*kH*kW x 1
+
+ THTensor_(free)(gradOutput3d);
+ THTensor_(free)(fgradInput3d);
+
+ THTensor_(zero)(gradInput);
+
+ THNN_(unfolded_acc)(fgradInput, gradInput, kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ outputWidth, outputHeight);
+
+}
+
+void THNN_(SpatialConvolutionLocal_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ long inputWidth, long inputHeight,
+ long outputWidth, long outputHeight)
+{
+ weight = THNN_(view_weight_local)(weight);
+
+ THNN_(SpatialConvolutionLocal_shapeCheck)
+ (input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW,
+ inputHeight, inputWidth, outputHeight, outputWidth);
+
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ long nInputPlane = THTensor_(size)(weight,2)/(kW*kH);
+ long nOutputPlane = THTensor_(size)(weight,1);
+
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(resizeAs)(fgradInput, finput);
+
+ THTensor *tweight = THTensor_(new)();
+ THTensor_(transpose)(tweight, weight, 1, 2);
+
+ if(input->nDimension == 3)
+ {
+ THNN_(SpatialConvolutionLocal_updateGradInput_frame)
+ (gradInput, gradOutput, tweight,
+ fgradInput, kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ nOutputPlane, outputWidth, outputHeight);
+ }
+ else
+ {
+ long T = input->size[0];
+ long t;
+
+#pragma omp parallel for private(t)
+ for(t = 0; t < T; t++)
+ {
+ THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t);
+ THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
+ THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t);
+
+ THNN_(SpatialConvolutionLocal_updateGradInput_frame)
+ (gradInput_t, gradOutput_t, tweight, fgradInput_t,
+ kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ nOutputPlane, outputWidth, outputHeight);
+
+ THTensor_(free)(gradInput_t);
+ THTensor_(free)(gradOutput_t);
+ THTensor_(free)(fgradInput_t);
+ }
+ }
+
+ THTensor_(free)(tweight);
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(weight);
+}
+
+static void THNN_(SpatialConvolutionLocal_accGradParameters_frame)
+ (THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias,
+ THTensor *finput, real scale,
+ int kW, int kH, int dW, int dH, int padW, int padH,
+ long nInputPlane, long inputWidth, long inputHeight,
+ long nOutputPlane, long outputWidth, long outputHeight)
+{
+
+ THTensor *gradOutput3d, *finput3d;
+ gradOutput3d = THTensor_(newWithStorage3d)(gradOutput->storage, gradOutput->storageOffset,
+ outputHeight*outputWidth, 1,
+ nOutputPlane, outputHeight*outputWidth,
+ 1, nOutputPlane*outputHeight*outputWidth);
+ finput3d = THTensor_(newWithStorage3d)(finput->storage, finput->storageOffset,
+ outputHeight*outputWidth, 1,
+ 1, kW*kH*nInputPlane*outputHeight*outputWidth,
+ kW*kH*nInputPlane, outputHeight*outputWidth);
+ // gradOutput3d: oH*oW x nOutputPlane x 1
+ // finput3d: oH*oW x 1 x kW*kH*nInputPlane
+ THTensor_(baddbmm)(gradWeight, 1.0, gradWeight, scale, gradOutput3d, finput3d);
+ // gradWeight: oH*oW x nOutputPlane x kW*kH*nInputPlane
+
+ THTensor_(cadd)(gradBias, gradBias, scale, gradOutput);
+
+ THTensor_(free)(gradOutput3d);
+ THTensor_(free)(finput3d);
+}
+
+void THNN_(SpatialConvolutionLocal_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ long inputWidth, long inputHeight,
+ long outputWidth, long outputHeight,
+ accreal scale_)
+{
+ THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous");
+ THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous");
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ gradWeight = THNN_(view_weight_local)(gradWeight);
+
+ THNN_(SpatialConvolutionLocal_shapeCheck)
+ (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW,
+ inputHeight, inputWidth, outputHeight, outputWidth);
+
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ long nInputPlane = THTensor_(size)(gradWeight,2)/(kW*kH);
+ long nOutputPlane = THTensor_(size)(gradWeight,1);
+
+ if(input->nDimension == 3)
+ {
+ THNN_(SpatialConvolutionLocal_accGradParameters_frame)
+ (gradOutput, gradWeight, gradBias, finput, scale,
+ kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ nOutputPlane, outputWidth, outputHeight);
+ }
+ else
+ {
+ long T = input->size[0];
+ long t;
+
+ for(t = 0; t < T; t++)
+ {
+ THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
+ THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
+
+ THNN_(SpatialConvolutionLocal_accGradParameters_frame)
+ (gradOutput_t, gradWeight, gradBias, finput_t, scale,
+ kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ nOutputPlane, outputWidth, outputHeight);
+
+ THTensor_(free)(gradOutput_t);
+ THTensor_(free)(finput_t);
+ }
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(gradWeight);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionMM.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionMM.c
new file mode 100644
index 000000000..28fea517c
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionMM.c
@@ -0,0 +1,377 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialConvolutionMM.c"
+#else
+
+static inline void THNN_(SpatialConvolutionMM_shapeCheck)(
+ THTensor *input, THTensor *gradOutput,
+ THTensor *weight, THTensor *bias,
+ int kH, int kW, int dH, int dW, int padH, int padW) {
+
+ THArgCheck(kW > 0 && kH > 0, 9,
+ "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
+ THArgCheck(dW > 0 && dH > 0, 11,
+ "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
+ THNN_ARGCHECK(weight->nDimension == 2 || weight->nDimension == 4, 5, weight,
+ "2D or 4D weight tensor expected, but got: %s");
+
+ if (bias != NULL) {
+ THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]);
+ }
+
+ int ndim = input->nDimension;
+ int dimf = 0;
+ int dimh = 1;
+ int dimw = 2;
+
+ if (ndim == 4) {
+ dimf++;
+ dimh++;
+ dimw++;
+ }
+
+ THNN_ARGCHECK(ndim == 3 || ndim == 4, 2, input,
+ "3D or 4D input tensor expected but got: %s");
+
+ long nInputPlane = weight->size[1] / (kH * kW);
+ long inputHeight = input->size[dimh];
+ long inputWidth = input->size[dimw];
+ long nOutputPlane = weight->size[0];
+ long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
+ long outputWidth = (inputWidth + 2*padW - kW) / dW + 1;
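+ // Illustrative check (assumed values): inputHeight = 7, kH = 3, padH = 1,
+ // dH = 2 gives outputHeight = (7 + 2 - 3)/2 + 1 = 4.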
+
+ if (outputWidth < 1 || outputHeight < 1)
+ THError("Given input size: (%d x %d x %d). "
+ "Calculated output size: (%d x %d x %d). Output size is too small",
+ nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth);
+
+ THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane);
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
+ }
+}
+
+static THTensor* THNN_(view_weight_MM2d)(THTensor *weight) {
+ weight = THTensor_(newContiguous)(weight);
+ if (weight->nDimension == 4) {
+ long s1 = weight->size[0];
+ long s2 = weight->size[1] * weight->size[2] * weight->size[3];
+ THTensor *old_weight = weight;
+ weight = THTensor_(newWithStorage2d)(weight->storage, weight->storageOffset,
+ s1, -1, s2, -1);
+ THTensor_(free)(old_weight);
+ }
+ return weight;
+}
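+
+// Shape note: a 4D weight (nOutputPlane, nInputPlane, kH, kW) is viewed above as
+// 2D (nOutputPlane, nInputPlane*kH*kW), which reduces the whole convolution to
+// one GEMM against the unfolded input (see updateOutput_frame below).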
+
+static void THNN_(SpatialConvolutionMM_updateOutput_frame)(
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *finput,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH,
+ long nInputPlane,
+ long inputWidth,
+ long inputHeight,
+ long nOutputPlane,
+ long outputWidth,
+ long outputHeight)
+{
+ long i;
+ THTensor *output2d;
+
+ THNN_(unfolded_copy)(finput, input, kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ outputWidth, outputHeight);
+
+ output2d = THTensor_(newWithStorage2d)(output->storage, output->storageOffset,
+ nOutputPlane, -1,
+ outputHeight*outputWidth, -1);
+ if (bias) {
+ for(i = 0; i < nOutputPlane; i++)
+ THVector_(fill)
+ (output->storage->data + output->storageOffset + output->stride[0] * i,
+ THTensor_(get1d)(bias, i), outputHeight*outputWidth);
+ } else {
+ THTensor_(zero)(output);
+ }
+
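+ // Shape note: after unfolded_copy, finput is (kW*kH*nInputPlane) x (oH*oW),
+ // so the addmm below is one GEMM:
+ //   output2d (nOutputPlane x oH*oW) += weight (nOutputPlane x kW*kH*nInputPlane) * finput
+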
+ THTensor_(addmm)(output2d, 1, output2d, 1, weight, finput);
+
+ THTensor_(free)(output2d);
+}
+
+void THNN_(SpatialConvolutionMM_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH)
+{
+ weight = THNN_(view_weight_MM2d)(weight);
+
+ THNN_(SpatialConvolutionMM_shapeCheck)
+ (input, NULL, weight, bias, kH, kW, dH, dW, padH, padW);
+
+ input = THTensor_(newContiguous)(input);
+ int ndim = input->nDimension;
+ int dimf = 0;
+ int dimh = 1;
+ int dimw = 2;
+
+ if (ndim == 4) {
+ dimf++;
+ dimh++;
+ dimw++;
+ }
+
+ long nInputPlane = input->size[dimf];
+ long inputHeight = input->size[dimh];
+ long inputWidth = input->size[dimw];
+ long nOutputPlane = weight->size[0];
+ long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
+ long outputWidth = (inputWidth + 2*padW - kW) / dW + 1;
+
+ if(input->nDimension == 3)
+ {
+ THTensor_(resize2d)(finput, kW*kH*nInputPlane, outputHeight*outputWidth);
+ THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth);
+
+ THNN_(SpatialConvolutionMM_updateOutput_frame)
+ (input, output, weight, bias, finput,
+ kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ nOutputPlane, outputWidth, outputHeight);
+ }
+ else
+ {
+ long T = input->size[0];
+ long t;
+
+ THTensor_(resize3d)(finput, T, kW*kH*nInputPlane, outputHeight*outputWidth);
+ THTensor_(resize4d)(output, T, nOutputPlane, outputHeight, outputWidth);
+
+#pragma omp parallel for private(t)
+ for(t = 0; t < T; t++)
+ {
+ THTensor *input_t = THTensor_(newSelect)(input, 0, t);
+ THTensor *output_t = THTensor_(newSelect)(output, 0, t);
+ THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
+
+ THNN_(SpatialConvolutionMM_updateOutput_frame)
+ (input_t, output_t, weight, bias, finput_t,
+ kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ nOutputPlane, outputWidth, outputHeight);
+
+ THTensor_(free)(input_t);
+ THTensor_(free)(output_t);
+ THTensor_(free)(finput_t);
+ }
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(weight);
+}
+
+static void THNN_(SpatialConvolutionMM_updateGradInput_frame)(
+ THTensor *gradInput,
+ THTensor *gradOutput,
+ THTensor *weight,
+ THTensor *fgradInput,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH)
+{
+ THTensor *gradOutput2d = THTensor_(newWithStorage2d)
+ (gradOutput->storage, gradOutput->storageOffset,
+ gradOutput->size[0], -1,
+ gradOutput->size[1]*gradOutput->size[2], -1);
+ THTensor_(addmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput2d);
+ THTensor_(free)(gradOutput2d);
+
+ THTensor_(zero)(gradInput);
+
+ THNN_(unfolded_acc)(fgradInput, gradInput, kW, kH, dW, dH,
+ padW, padH,
+ gradInput->size[0], gradInput->size[2], gradInput->size[1],
+ gradOutput->size[2], gradOutput->size[1]);
+}
+
+void THNN_(SpatialConvolutionMM_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH)
+{
+ weight = THNN_(view_weight_MM2d)(weight);
+
+ THNN_(SpatialConvolutionMM_shapeCheck)
+ (input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW);
+
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(resizeAs)(fgradInput, finput);
+
+ // depending on the BLAS library, fgradInput (result tensor) might
+ // be left uninitialized on zero alpha, which might lead to weird behavior
+ // hence, to be safe, zero it
+ THTensor_(zero)(fgradInput);
+ THTensor *tweight = THTensor_(new)();
+ THTensor_(transpose)(tweight, weight, 0, 1);
+
+ if(input->nDimension == 3)
+ {
+ THNN_(SpatialConvolutionMM_updateGradInput_frame)(gradInput, gradOutput,
+ tweight, fgradInput,
+ kW, kH, dW, dH, padW, padH);
+ }
+ else
+ {
+ long T = input->size[0];
+ long t;
+
+#pragma omp parallel for private(t)
+ for(t = 0; t < T; t++)
+ {
+ THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t);
+ THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
+ THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t);
+
+ THNN_(SpatialConvolutionMM_updateGradInput_frame)(gradInput_t, gradOutput_t,
+ tweight, fgradInput_t,
+ kW, kH, dW, dH, padW, padH);
+
+ THTensor_(free)(gradInput_t);
+ THTensor_(free)(gradOutput_t);
+ THTensor_(free)(fgradInput_t);
+ }
+ }
+
+ THTensor_(free)(tweight);
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(weight);
+}
+
+static void THNN_(SpatialConvolutionMM_accGradParameters_frame)(
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *finput,
+ real scale)
+{
+ long i;
+ THTensor *gradOutput2d = THTensor_(newWithStorage2d)
+ (gradOutput->storage, gradOutput->storageOffset,
+ gradOutput->size[0], -1,
+ gradOutput->size[1]*gradOutput->size[2], -1);
+
+ THTensor *tfinput = THTensor_(new)();
+ THTensor_(transpose)(tfinput, finput, 0, 1);
+ THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutput2d, tfinput);
+ THTensor_(free)(tfinput);
+
+ if (gradBias) {
+ for(i = 0; i < gradBias->size[0]; i++)
+ {
+ long k;
+ real sum = 0;
+ real *data = gradOutput2d->storage->data + gradOutput2d->storageOffset + i*gradOutput2d->stride[0];
+ for(k = 0; k < gradOutput2d->size[1]; k++)
+ sum += data[k];
+ (gradBias->storage->data + gradBias->storageOffset)[i] += scale*sum;
+ }
+ }
+
+ THTensor_(free)(gradOutput2d);
+}
+
+void THNN_(SpatialConvolutionMM_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH,
+ accreal scale_)
+{
+ THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous");
+ if (gradBias)
+ THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous");
+
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ gradWeight = THNN_(view_weight_MM2d)(gradWeight);
+
+ THNN_(SpatialConvolutionMM_shapeCheck)
+ (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW);
+
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ if(input->nDimension == 3)
+ {
+ THNN_(SpatialConvolutionMM_accGradParameters_frame)(gradOutput, gradWeight,
+ gradBias, finput, scale);
+ }
+ else
+ {
+ long T = input->size[0];
+ long t;
+
+ for(t = 0; t < T; t++)
+ {
+ THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
+ THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
+
+ THNN_(SpatialConvolutionMM_accGradParameters_frame)(gradOutput_t, gradWeight,
+ gradBias, finput_t, scale);
+
+ THTensor_(free)(gradOutput_t);
+ THTensor_(free)(finput_t);
+ }
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(gradWeight);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionMap.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionMap.c
new file mode 100644
index 000000000..142a03551
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionMap.c
@@ -0,0 +1,277 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialConvolutionMap.c"
+#else
+
+void THNN_(SpatialConvolutionMap_updateOutput)(
+ THNNState *state, THTensor *input, THTensor *output, THTensor *weight, THTensor *bias,
+ THTensor *connTable, int nInputPlane, int nOutputPlane,
+ int dW, int dH)
+{
+ THArgCheck(
+ weight != NULL && weight->nDimension == 3
+ && connTable != NULL && connTable->size[0] == weight->size[0], 4,
+ "3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
+ );
+
+ int dimw = 2;
+ int dimh = 1;
+ int dimc = 0;
+ long nbatch = 1;
+
+ THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D(batch mode) tensor expected");
+
+ if (input->nDimension == 4)
+ {
+ nbatch = input->size[0];
+ dimc++;
+ dimw++;
+ dimh++;
+ }
+
+ const long kH = weight->size[1];
+ const long kW = weight->size[2];
+
+ THArgCheck(input->size[dimc] >= nInputPlane, 2, "invalid number of input planes");
+ THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH, 2, "input image smaller than kernel size");
+
+ const long input_w = input->size[dimw];
+ const long input_h = input->size[dimh];
+ const long output_w = (input_w - kW) / dW + 1;
+ const long output_h = (input_h - kH) / dH + 1;
+
+ if (input->nDimension == 3)
+ THTensor_(resize3d)(output, nOutputPlane, output_h, output_w);
+ else
+ THTensor_(resize4d)(output, input->size[0], nOutputPlane, output_h, output_w);
+
+ /* contiguous */
+ input = THTensor_(newContiguous)(input);
+ output = THTensor_(newContiguous)(output);
+ weight = THTensor_(newContiguous)(weight);
+ bias = bias ? THTensor_(newContiguous)(bias) : bias;
+ connTable = THTensor_(newContiguous)(connTable);
+
+ /* get raw pointers */
+ real *input_data = THTensor_(data)(input);
+ real *output_data = THTensor_(data)(output);
+ real *weight_data = THTensor_(data)(weight);
+ real *bias_data = bias ? THTensor_(data)(bias) : NULL;
+ real *connTable_data = THTensor_(data)(connTable);
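+
+ /* connTable layout: row k holds the pair {inputPlane, outputPlane}
+ (TH_INDEX_BASE-based), i.e. kernel k connects input map i to output map o;
+ the loops below scan the table once per output plane. */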
+
+ long p;
+#pragma omp parallel for private(p)
+ for (p = 0; p < nOutputPlane; p++)
+ {
+ long m;
+ for (m = 0; m < nbatch; m++)
+ {
+ /* add bias */
+ real *ptr_output = output_data + p*output_w*output_h + m*nOutputPlane*output_w*output_h;
+ long j, k;
+ real z = bias_data ? bias_data[p] : 0;
+ for (j = 0; j < output_h*output_w; j++)
+ ptr_output[j] = z;
+
+ /* convolve all maps */
+ int nweight = connTable->size[0];
+ for (k = 0; k < nweight; k++)
+ {
+ /* get offsets for input/output */
+ int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE;
+ int i = (int)connTable_data[k*2+0] - TH_INDEX_BASE;
+
+ if (o == p)
+ {
+ THTensor_(validXCorr2Dptr)(
+ output_data + o*output_w*output_h + m*nOutputPlane*output_w*output_h,
+ 1.0,
+ input_data + i*input_w*input_h + m*nInputPlane*input_w*input_h, input_h, input_w,
+ weight_data + k*kW*kH,
+ kH, kW,
+ dH, dW
+ );
+ }
+ }
+ }
+ }
+
+ /* clean up */
+ THTensor_(free)(input);
+ THTensor_(free)(output);
+ THTensor_(free)(weight);
+ if (bias) THTensor_(free)(bias);
+ THTensor_(free)(connTable);
+}
+
+void THNN_(SpatialConvolutionMap_updateGradInput)(
+ THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *weight, THTensor *bias,
+ THTensor *connTable, int nInputPlane, int nOutputPlane,
+ int dW, int dH)
+{
+ THArgCheck(
+ weight != NULL && weight->nDimension == 3
+ && connTable != NULL && connTable->size[0] == weight->size[0], 5,
+ "3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
+ );
+
+ /* and dims */
+ int dimw = 2;
+ int dimh = 1;
+ long nbatch = 1;
+ if (input->nDimension == 4)
+ {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ }
+
+ const long input_h = input->size[dimh];
+ const long input_w = input->size[dimw];
+ const long output_h = gradOutput->size[dimh];
+ const long output_w = gradOutput->size[dimw];
+ const long kH = weight->size[1];
+ const long kW = weight->size[2];
+
+ /* contiguous */
+ gradInput = THTensor_(newContiguous)(gradInput);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ weight = THTensor_(newContiguous)(weight);
+ connTable = THTensor_(newContiguous)(connTable);
+
+ /* Resize/Zero */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ /* get raw pointers */
+ real *gradInput_data = THTensor_(data)(gradInput);
+ real *gradOutput_data = THTensor_(data)(gradOutput);
+ real *weight_data = THTensor_(data)(weight);
+ real *connTable_data = THTensor_(data)(connTable);
+
+ long p;
+#pragma omp parallel for private(p)
+ for (p = 0; p < nInputPlane; p++)
+ {
+ long m;
+ for (m = 0; m < nbatch; m++)
+ {
+ long k;
+ /* backward all */
+ int nkernel = connTable->size[0];
+ for (k = 0; k < nkernel; k++)
+ {
+ int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE;
+ int i = (int)connTable_data[k*2+0] - TH_INDEX_BASE;
+ if (i == p)
+ {
+ /* gradient to input */
+ THTensor_(fullConv2Dptr)(
+ gradInput_data + i*input_w*input_h + m*nInputPlane*input_w*input_h, 1.0,
+ gradOutput_data + o*output_w*output_h + m*nOutputPlane*output_w*output_h, output_h, output_w,
+ weight_data + k*kW*kH, kH, kW, dH, dW
+ );
+ }
+ }
+ }
+ }
+
+ /* clean up */
+ THTensor_(free)(gradInput);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(weight);
+ THTensor_(free)(connTable);
+}
+
+void THNN_(SpatialConvolutionMap_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *connTable,
+ int nInputPlane,
+ int nOutputPlane,
+ int dW, int dH,
+ accreal scale_)
+{
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ THArgCheck(
+ gradWeight != NULL && gradWeight->nDimension == 3
+ && connTable != NULL && connTable->size[0] == gradWeight->size[0], 5,
+ "3D gradWeight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
+ );
+
+ /* and dims */
+ int dimw = 2;
+ int dimh = 1;
+ long nbatch = 1;
+ if (input->nDimension == 4)
+ {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ }
+
+ const long input_h = input->size[dimh];
+ const long input_w = input->size[dimw];
+ const long output_h = gradOutput->size[dimh];
+ const long output_w = gradOutput->size[dimw];
+ const long kH = gradWeight->size[1];
+ const long kW = gradWeight->size[2];
+
+ /* contiguous */
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous");
+ THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous");
+
+ /* get raw pointers */
+ real *input_data = THTensor_(data)(input);
+ real *gradOutput_data = THTensor_(data)(gradOutput);
+ real *gradWeight_data = THTensor_(data)(gradWeight);
+ real *gradBias_data = THTensor_(data)(gradBias);
+
+
+ long k;
+ /* gradients wrt bias */
+#pragma omp parallel for private(k)
+ for (k = 0; k < nOutputPlane; k++)
+ {
+ long m;
+ for (m = 0; m < nbatch; m++)
+ {
+ real *ptr_gradOutput = gradOutput_data + k*output_w*output_h + m*nOutputPlane*output_w*output_h;
+ long l;
+ for (l = 0; l < output_h*output_w; l++)
+ gradBias_data[k] += scale*ptr_gradOutput[l];
+ }
+ }
+
+ /* gradients wrt weight */
+ const int nkernel = connTable->size[0];
+#pragma omp parallel for private(k)
+ for (k = 0; k < nkernel; k++)
+ {
+ long m;
+ for (m = 0; m < nbatch; m++)
+ {
+ int o = (int)THTensor_(get2d)(connTable,k,1) - TH_INDEX_BASE;
+ int i = (int)THTensor_(get2d)(connTable,k,0) - TH_INDEX_BASE;
+
+ /* gradient to kernel */
+ THTensor_(validXCorr2DRevptr)(
+ gradWeight_data + k*kW*kH,
+ scale,
+ input_data + i*input_w*input_h + m*nInputPlane*input_w*input_h, input_h, input_w,
+ gradOutput_data + o*output_w*output_h + m*nOutputPlane*output_w*output_h , output_h, output_w,
+ dH, dW
+ );
+ }
+ }
+
+ /* clean up */
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialDepthWiseConvolution.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialDepthWiseConvolution.c
new file mode 100644
index 000000000..efb66a3e3
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialDepthWiseConvolution.c
@@ -0,0 +1,528 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialDepthWiseConvolution.c"
+#else
+
+static inline void THNN_(SpatialDepthWiseConvolution_shapeCheck)(
+ THTensor *input, THTensor *gradOutput,
+ THTensor *weight, THTensor *bias,
+ int kH, int kW, int dH, int dW, int padH, int padW) {
+
+ THArgCheck(kW > 0 && kH > 0, 9,
+ "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
+ THArgCheck(dW > 0 && dH > 0, 11,
+ "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
+ THNN_ARGCHECK(weight->nDimension == 4, 5, weight,
+ "4D weight tensor expected, but got: %s");
+
+ if (bias != NULL) {
+ THNN_CHECK_DIM_SIZE(bias, 2, 0, weight->size[0]);
+ THNN_CHECK_DIM_SIZE(bias, 2, 1, weight->size[1]);
+ }
+
+ int ndim = input->nDimension;
+ int dimf = 0;
+ int dimh = 1;
+ int dimw = 2;
+
+ if (ndim == 4) {
+ dimf++;
+ dimh++;
+ dimw++;
+ }
+
+ THNN_ARGCHECK(ndim == 3 || ndim == 4, 2, input,
+ "3D or 4D input tensor expected but got: %s");
+
+ long nInputPlane = weight->size[1];
+ long inputHeight = input->size[dimh];
+ long inputWidth = input->size[dimw];
+ long nOutputPlane = weight->size[0];
+ long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
+ long outputWidth = (inputWidth + 2*padW - kW) / dW + 1;
+
+ if (outputWidth < 1 || outputHeight < 1)
+ THError("Given input size: (%d x %d x %d). "
+ "Calculated output size: (%d x %d x %d). Output size is too small",
+ nInputPlane,inputHeight,inputWidth,nOutputPlane*nInputPlane,outputHeight,outputWidth);
+
+ THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane);
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim + 1, dimf, nInputPlane);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim + 1, dimh, nOutputPlane);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim + 1, dimw, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim + 1, dimw + 1, outputWidth);
+ }
+}
+
+static void THNN_(SpatialDepthWiseConvolution_updateOutput_frame)(
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *finput,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH,
+ long nInputPlane,
+ long inputWidth,
+ long inputHeight,
+ long nOutputPlane,
+ long outputWidth,
+ long outputHeight)
+{
+ long i;
+ THTensor *output2d;
+
+ THNN_(unfolded_copy)(finput, input, kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ outputWidth, outputHeight);
+
+ output2d = THTensor_(newWithStorage2d)(output->storage, output->storageOffset,
+ nOutputPlane, -1,
+ outputHeight*outputWidth, -1);
+ if (bias) {
+ for(i = 0; i < nOutputPlane; i++)
+ THVector_(fill)
+ (output->storage->data + output->storageOffset + output->stride[0] * i,
+ THTensor_(get1d)(bias, i), outputHeight*outputWidth);
+ } else {
+ THTensor_(zero)(output);
+ }
+
+ THTensor_(addmm)(output2d, 1, output2d, 1, weight, finput);
+
+ THTensor_(free)(output2d);
+}
+
+void THNN_(SpatialDepthWiseConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH)
+{
+ long nInputPlane = weight->nDimension == 2 ? weight->size[1]/(kH*kW) : weight->size[1];
+ long nOutputPlane = weight->size[0];
+ if (weight->nDimension == 2) {
+ THTensor_(resize4d)(weight, nOutputPlane, nInputPlane, kH, kW);
+ }
+
+ THNN_(SpatialDepthWiseConvolution_shapeCheck)
+ (input, NULL, weight, bias, kH, kW, dH, dW, padH, padW);
+
+ THTensor *_weight = THTensor_(newTranspose)(weight, 0, 1);
+ weight = THTensor_(newContiguous)(_weight);
+
+ THTensor *_bias = NULL;
+ if(bias) {
+ _bias = THTensor_(newTranspose)(bias, 0, 1);
+ bias = THTensor_(newContiguous)(_bias);
+ }
+
+ // resize weight
+ long s1 = weight->size[0];
+ long s2 = weight->size[1];
+ long s3 = weight->size[2] * weight->size[3];
+ weight = THTensor_(newWithStorage3d)(weight->storage, weight->storageOffset,
+ s1, -1, s2, -1, s3, -1);
+
+ input = THTensor_(newContiguous)(input);
+
+ int ndim = input->nDimension;
+
+ int batch = 1;
+ if (ndim == 3) {
+ // Force batch
+ batch = 0;
+ THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
+ }
+
+ long inputHeight = input->size[2];
+ long inputWidth = input->size[3];
+ long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
+ long outputWidth = (inputWidth + 2*padW - kW) / dW + 1;
+
+ long T = input->size[0];
+ long t;
+
+ THTensor_(resize5d)(output, T, nInputPlane, nOutputPlane, outputHeight, outputWidth);
+ THTensor_(resize4d)(finput, T, nInputPlane, kW*kH*1, outputHeight*outputWidth);
+
+#pragma omp parallel for private(t)
+ for(t = 0; t < T; t++)
+ {
+ THTensor *input_t = THTensor_(newSelect)(input, 0, t);
+ THTensor *output_t = THTensor_(newSelect)(output, 0, t);
+ THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
+
+ long i;
+#pragma omp parallel for private(i)
+ for(i = 0; i < nInputPlane; i++)
+ {
+ THTensor *weight_i = THTensor_(newSelect)(weight, 0, i);
+ THTensor *input_i = THTensor_(newNarrow)(input_t, 0, i, 1);
+ THTensor *output_i = THTensor_(newSelect)(output_t, 0, i);
+ THTensor *finput_i = THTensor_(newSelect)(finput_t, 0, i);
+ THTensor *bias_i = NULL;
+ if(bias) {
+ bias_i = THTensor_(newSelect)(bias, 0, i);
+ }
+ THNN_(SpatialDepthWiseConvolution_updateOutput_frame)
+ (input_i, output_i, weight_i, bias_i, finput_i,
+ kW, kH, dW, dH, padW, padH,
+ 1, inputWidth, inputHeight,
+ nOutputPlane, outputWidth, outputHeight);
+
+ THTensor_(free)(input_i);
+ THTensor_(free)(weight_i);
+ THTensor_(free)(bias_i);
+ THTensor_(free)(output_i);
+ THTensor_(free)(finput_i);
+ }
+ THTensor_(free)(input_t);
+ THTensor_(free)(output_t);
+ THTensor_(free)(finput_t);
+ }
+
+ THTensor_(free)(weight);
+ THTensor_(free)(_weight);
+ THTensor_(free)(bias);
+ THTensor_(free)(_bias);
+ THTensor_(resize4d)(output, T, nInputPlane * nOutputPlane, outputHeight, outputWidth);
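+ // Layout note: every input plane is convolved with its own set of nOutputPlane
+ // filters, so the flattened output carries nInputPlane*nOutputPlane channels
+ // (nOutputPlane acts as a depth multiplier).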
+
+ if (batch == 0) {
+ THTensor_(select)(output, NULL, 0, 0);
+ THTensor_(select)(input, NULL, 0, 0);
+ THTensor_(select)(finput, NULL, 0, 0);
+ }
+ THTensor_(free)(input);
+}
+
+static void THNN_(SpatialDepthWiseConvolution_updateGradInput_frame)(
+ THTensor *gradInput,
+ THTensor *gradOutput,
+ THTensor *weight,
+ THTensor *fgradInput,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH)
+{
+ THTensor *gradOutput2d = THTensor_(newWithStorage2d)
+ (gradOutput->storage, gradOutput->storageOffset,
+ gradOutput->size[0], -1,
+ gradOutput->size[1]*gradOutput->size[2], -1);
+ THTensor_(addmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput2d);
+ THTensor_(free)(gradOutput2d);
+
+ THTensor_(zero)(gradInput);
+
+ THNN_(unfolded_acc)(fgradInput, gradInput, kW, kH, dW, dH,
+ padW, padH,
+ gradInput->size[0], gradInput->size[2], gradInput->size[1],
+ gradOutput->size[2], gradOutput->size[1]);
+}
+
+void THNN_(SpatialDepthWiseConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH)
+{
+ long nInputPlane = weight->nDimension == 2 ? weight->size[1]/(kH*kW) : weight->size[1];
+ long nOutputPlane = weight->size[0];
+ if (weight->nDimension == 2) {
+ THTensor_(resize4d)(weight, nOutputPlane, nInputPlane, kH, kW);
+ }
+ gradOutput = THTensor_(newWithTensor)(gradOutput);
+
+ if (input->nDimension == 3) {
+ if (gradOutput->nDimension == 3) {
+ THTensor_(resize4d)(gradOutput, nInputPlane, nOutputPlane, gradOutput->size[1], gradOutput->size[2]);
+ }
+ }
+ else
+ {
+ if (gradOutput->nDimension == 4) {
+ THTensor_(resize5d)(gradOutput, gradOutput->size[0], nInputPlane, nOutputPlane, gradOutput->size[2], gradOutput->size[3]);
+ }
+ }
+
+
+ THNN_(SpatialDepthWiseConvolution_shapeCheck)
+ (input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW);
+
+ THTensor *_weight = THTensor_(newTranspose)(weight, 0, 1);
+ weight = THTensor_(newContiguous)(_weight);
+
+
+ // resize weight
+ long s1 = weight->size[0];
+ long s2 = weight->size[1];
+ long s3 = weight->size[2] * weight->size[3];
+ weight = THTensor_(newWithStorage3d)(weight->storage, weight->storageOffset,
+ s1, -1, s2, -1, s3, -1);
+
+ input = THTensor_(newContiguous)(input);
+
+ int batch = 1;
+ if (input->nDimension == 3) {
+ // Force batch
+ batch = 0;
+ THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
+ THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
+ }
+
+ long inputHeight = input->size[2];
+ long inputWidth = input->size[3];
+ long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
+ long outputWidth = (inputWidth + 2*padW - kW) / dW + 1;
+
+ long T = input->size[0];
+ long t;
+
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(resize4d)(fgradInput, T, nInputPlane, kW*kH*1, outputHeight*outputWidth);
+
+ // depending on the BLAS library, fgradInput (result tensor) might
+ // be left uninitialized on zero alpha, which might lead to weird behavior
+ // hence, to be safe, zero it
+ THTensor_(zero)(fgradInput);
+
+#pragma omp parallel for private(t)
+ for(t = 0; t < T; t++)
+ {
+ THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t);
+ THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
+ THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t);
+
+
+ long i;
+#pragma omp parallel for private(i)
+ for(i = 0; i < nInputPlane; i++)
+ {
+ THTensor *weight_i = THTensor_(newSelect)(weight, 0, i);
+ THTensor *gradInput_i = THTensor_(newNarrow)(gradInput_t, 0, i, 1);
+ THTensor *gradOutput_i = THTensor_(newSelect)(gradOutput_t, 0, i);
+ THTensor *fgradInput_i = THTensor_(newSelect)(fgradInput_t, 0, i);
+
+ THTensor_(transpose)(weight_i, weight_i, 0, 1);
+
+ THNN_(SpatialDepthWiseConvolution_updateGradInput_frame)(gradInput_i, gradOutput_i,
+ weight_i, fgradInput_i,
+ kW, kH, dW, dH, padW, padH);
+
+ THTensor_(free)(gradInput_i);
+ THTensor_(free)(weight_i);
+ THTensor_(free)(gradOutput_i);
+ THTensor_(free)(fgradInput_i);
+ }
+
+ THTensor_(free)(gradInput_t);
+ THTensor_(free)(gradOutput_t);
+ THTensor_(free)(fgradInput_t);
+ }
+
+ if (batch == 0) {
+ THTensor_(select)(gradOutput, NULL, 0, 0);
+ THTensor_(select)(input, NULL, 0, 0);
+ THTensor_(select)(gradInput, NULL, 0, 0);
+ THTensor_(select)(fgradInput, NULL, 0, 0);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(weight);
+ THTensor_(free)(_weight);
+}
+
+static void THNN_(SpatialDepthWiseConvolution_accGradParameters_frame)(
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *finput,
+ accreal scale)
+{
+ long i;
+ THTensor *gradOutput2d = THTensor_(newWithStorage2d)
+ (gradOutput->storage, gradOutput->storageOffset,
+ gradOutput->size[0], -1,
+ gradOutput->size[1]*gradOutput->size[2], -1);
+
+ THTensor_(transpose)(finput, finput, 0, 1);
+ THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutput2d, finput);
+ THTensor_(transpose)(finput, finput, 0, 1);
+
+ if (gradBias) {
+ for(i = 0; i < gradBias->size[0]; i++)
+ {
+ long k;
+ real sum = 0;
+ real *data = gradOutput2d->storage->data + gradOutput2d->storageOffset + i*gradOutput2d->stride[0];
+ for(k = 0; k < gradOutput2d->size[1]; k++)
+ sum += data[k];
+ (gradBias->storage->data + gradBias->storageOffset)[i] += scale*sum;
+ }
+ }
+
+ THTensor_(free)(gradOutput2d);
+}
+
+void THNN_(SpatialDepthWiseConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH,
+ accreal scale)
+{
+ long nInputPlane = gradWeight->nDimension == 2 ? gradWeight->size[1]/(kH*kW) : gradWeight->size[1];
+ long nOutputPlane = gradWeight->size[0];
+ if (gradWeight->nDimension == 2) {
+ THTensor_(resize4d)(gradWeight, nOutputPlane, nInputPlane, kH, kW);
+ }
+
+ gradOutput = THTensor_(newWithTensor)(gradOutput);
+ if (input->nDimension == 3) {
+ if (gradOutput->nDimension == 3) {
+ THTensor_(resize4d)(gradOutput, nInputPlane, nOutputPlane, gradOutput->size[1], gradOutput->size[2]);
+ }
+ }
+ else
+ {
+ if (gradOutput->nDimension == 4) {
+ THTensor_(resize5d)(gradOutput, gradOutput->size[0], nInputPlane, nOutputPlane, gradOutput->size[2], gradOutput->size[3]);
+ }
+ }
+
+
+ THNN_(SpatialDepthWiseConvolution_shapeCheck)
+ (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW);
+
+ // Transpose gradWeight & gradBias
+ THTensor_(transpose)(gradWeight, NULL, 0, 1);
+ THTensor *_gradWeight;
+ _gradWeight = gradWeight;
+ gradWeight = THTensor_(newContiguous)(gradWeight);
+
+ THTensor *_gradBias = NULL;
+ if(gradBias) {
+ THTensor_(transpose)(gradBias, NULL, 0, 1);
+ _gradBias = gradBias;
+ gradBias = THTensor_(newContiguous)(gradBias);
+ }
+
+ // resize gradWeight
+ long s1 = gradWeight->size[0];
+ long s2 = gradWeight->size[1];
+ long s3 = gradWeight->size[2] * gradWeight->size[3];
+ gradWeight = THTensor_(newWithStorage3d)(gradWeight->storage, gradWeight->storageOffset,
+ s1, -1, s2, -1, s3, -1);
+
+ input = THTensor_(newContiguous)(input);
+
+
+ int batch = 1;
+ if (input->nDimension == 3) {
+ // Force batch
+ batch = 0;
+ THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
+ THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
+ }
+
+ long inputHeight = input->size[2];
+ long inputWidth = input->size[3];
+ long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
+ long outputWidth = (inputWidth + 2*padW - kW) / dW + 1;
+
+ long T = input->size[0];
+ long t;
+ THTensor_(resize4d)(finput, T, nInputPlane, kW*kH*1, outputHeight*outputWidth);
+
+ for(t = 0; t < T; t++)
+ {
+ THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
+ THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
+ long i;
+#pragma omp parallel for private(i)
+ for(i = 0; i < nInputPlane; i++)
+ {
+ THTensor *finput_i = THTensor_(newSelect)(finput_t, 0, i);
+ THTensor *gradOutput_i = THTensor_(newSelect)(gradOutput_t, 0, i);
+ THTensor *gradWeight_i = THTensor_(newSelect)(gradWeight, 0, i);
+ THTensor *gradBias_i = NULL;
+ if(gradBias) {
+ gradBias_i = THTensor_(newSelect)(gradBias, 0, i);
+ }
+ THNN_(SpatialDepthWiseConvolution_accGradParameters_frame)(gradOutput_i, gradWeight_i,
+ gradBias_i, finput_i, scale);
+
+ THTensor_(free)(finput_i);
+ THTensor_(free)(gradOutput_i);
+ THTensor_(free)(gradWeight_i);
+ THTensor_(free)(gradBias_i);
+ }
+
+ THTensor_(free)(gradOutput_t);
+ THTensor_(free)(finput_t);
+ }
+
+ // Copy back and transpose back
+ THTensor_(transpose)(_gradWeight, NULL, 0, 1);
+ THTensor_(resize4d)(_gradWeight, nInputPlane, nOutputPlane, kH, kW);
+ THTensor_(copy)(_gradWeight, gradWeight);
+ THTensor_(transpose)(_gradWeight, NULL, 0, 1);
+
+ if(gradBias) {
+ THTensor_(transpose)(_gradBias, NULL, 0, 1);
+ THTensor_(resize2d)(_gradBias, nInputPlane, nOutputPlane);
+ THTensor_(copy)(_gradBias, gradBias);
+ THTensor_(transpose)(_gradBias, NULL, 0, 1);
+ }
+
+ if (batch == 0) {
+ THTensor_(select)(gradOutput, NULL, 0, 0);
+ THTensor_(select)(input, NULL, 0, 0);
+ THTensor_(select)(finput, NULL, 0, 0);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(gradWeight);
+ THTensor_(free)(gradBias);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialDilatedConvolution.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialDilatedConvolution.c
new file mode 100644
index 000000000..897cc0da4
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialDilatedConvolution.c
@@ -0,0 +1,408 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialDilatedConvolution.c"
+#else
+
+static inline void THNN_(SpatialDilatedConvolution_shapeCheck)(
+ THTensor *input, THTensor *gradOutput,
+ THTensor *weight, THTensor *bias,
+ int kH, int kW, int dH, int dW, int padH, int padW,
+ int dilationH, int dilationW) {
+
+ THNN_ARGCHECK(weight->nDimension == 4, 4, weight,
+ "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, "
+ "but got: %s");
+ THArgCheck(kW > 0 && kH > 0, 9,
+ "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
+ THArgCheck(dW > 0 && dH > 0, 11,
+ "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
+ THArgCheck(dilationW > 0 && dilationH > 0, 15,
+ "dilation should be greater than zero, but got dilationH: %d, dilationW: %d",
+ dilationH, dilationW);
+
+ if (bias != NULL) {
+ THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]);
+ }
+
+ int ndim = input->nDimension;
+ int dimf = 0;
+ int dimh = 1;
+ int dimw = 2;
+
+ if (ndim == 4) {
+ dimf++;
+ dimh++;
+ dimw++;
+ }
+
+ THNN_ARGCHECK(ndim == 3 || ndim == 4, 2, input,
+ "3D or 4D input tensor expected but got: %s");
+
+ long nInputPlane = weight->size[1];
+ long inputHeight = input->size[dimh];
+ long inputWidth = input->size[dimw];
+ long nOutputPlane = weight->size[0];
+ long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+ long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
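+ // Illustrative (assumed values): kW = 3 with dilationW = 2 covers an effective
+ // width of dilationW*(kW - 1) + 1 = 5, so a 9-wide input with padW = 0, dW = 1
+ // yields outputWidth = (9 - 5)/1 + 1 = 5.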
+
+ if (outputWidth < 1 || outputHeight < 1)
+ THError("Given input size: (%ld x %ld x %ld). "
+ "Calculated output size: (%ld x %ld x %ld). Output size is too small",
+ nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth);
+
+ THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane);
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
+ }
+}
+
+void THNN_(SpatialDilatedConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *columns,
+ THTensor *ones,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int dilationW, int dilationH)
+{
+
+ THNN_(SpatialDilatedConvolution_shapeCheck)
+ (input, NULL, weight, bias, kH, kW, dH, dW, padH, padW,
+ dilationH, dilationW);
+
+ // Params:
+ int nInputPlane = weight->size[1];
+ int nOutputPlane = weight->size[0];
+
+ input = THTensor_(newContiguous)(input);
+ weight = THTensor_(newContiguous)(weight);
+ bias = bias ? THTensor_(newContiguous)(bias) : bias;
+ int batch = 1;
+ if (input->nDimension == 3) {
+ // Force batch
+ batch = 0;
+ THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
+ }
+ long inputWidth = input->size[3];
+ long inputHeight = input->size[2];
+ long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+ long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Resize output
+ THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth);
+ THTensor_(zero)(output);
+
+ // Resize temporary columns
+ THTensor_(resize2d)(columns, nInputPlane*kW*kH, outputHeight*outputWidth);
+
+ // Define a buffer of ones, for bias accumulation
+ // Note: this buffer can be shared with other modules, it only ever gets increased,
+ // and always contains ones.
+ if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
+ // Resize plane and fill with ones...
+ THTensor_(resize2d)(ones, outputHeight, outputWidth);
+ THTensor_(fill)(ones, 1);
+ }
+
+ // Helpers
+ THTensor *input_n = THTensor_(new)();
+ THTensor *output_n = THTensor_(new)();
+
+ // For each elt in batch, do:
+  for (int elt = 0; elt < batchSize; elt++) {
+    // Matrix multiply per output:
+ THTensor_(select)(input_n, input, 0, elt);
+ THTensor_(select)(output_n, output, 0, elt);
+
+ // Do Bias first:
+ // M,N,K are dims of matrix A and B
+ long m_ = nOutputPlane;
+ long n_ = outputHeight * outputWidth;
+ long k_ = 1;
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
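+    // With k_ = 1 this is the rank-1 product bias * ones^T, broadcasting
+    // bias[p] to every one of the n_ spatial positions of output plane p.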
+ if (bias) {
+ THBlas_(gemm)(
+ 't', 'n',
+ n_, m_, k_,
+ 1,
+ THTensor_(data)(ones), k_,
+ THTensor_(data)(bias), k_,
+ 0,
+ THTensor_(data)(output_n), n_
+ );
+ } else {
+ THTensor_(zero)(output_n);
+ }
+
+ // Extract columns:
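+    // im2col unrolls each dilated kH x kW receptive field into one column of
+    // `columns` ((nInputPlane*kW*kH) x (outputHeight*outputWidth)), so the
+    // whole convolution reduces to the single GEMM below.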
+ THNN_(im2col)(
+ THTensor_(data)(input_n),
+ nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
+ dilationH, dilationW,
+ THTensor_(data)(columns)
+ );
+
+ // M,N,K are dims of matrix A and B
+ long m = nOutputPlane;
+ long n = columns->size[1];
+ long k = nInputPlane*kH*kW;
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
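+    // A row-major matrix read as column-major is its transpose, so swapping
+    // the operands computes output_n^T = columns^T * weight^T, i.e.
+    // output_n += weight * columns in the row-major view.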
+ THBlas_(gemm)(
+ 'n', 'n',
+ n, m, k,
+ 1,
+ THTensor_(data)(columns), n,
+ THTensor_(data)(weight), k,
+ 1,
+ THTensor_(data)(output_n), n
+ );
+ }
+
+ // Free
+ THTensor_(free)(input_n);
+ THTensor_(free)(output_n);
+
+ // Resize output
+ if (batch == 0) {
+ THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth);
+ THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(weight);
+ if (bias) THTensor_(free)(bias);
+}
+
+void THNN_(SpatialDilatedConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *gradColumns,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int dilationW, int dilationH)
+{
+ THNN_(SpatialDilatedConvolution_shapeCheck)
+ (input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW,
+ dilationH, dilationW);
+
+ // Params
+ int nInputPlane = weight->size[1];
+ int nOutputPlane = weight->size[0];
+
+ input = THTensor_(newContiguous)(input);
+ weight = THTensor_(newContiguous)(weight);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ int batch = 1;
+ if (input->nDimension == 3) {
+ // Force batch
+ batch = 0;
+ THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
+ THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1],
+ gradOutput->size[2]);
+ }
+
+ long inputWidth = input->size[3];
+ long inputHeight = input->size[2];
+ long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+ long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Resize output
+ THTensor_(resize4d)(gradInput, batchSize, nInputPlane, inputHeight, inputWidth);
+
+ // Resize temporary columns
+ THTensor_(resize2d)(gradColumns, nInputPlane*kW*kH, outputHeight*outputWidth);
+ THTensor_(zero)(gradColumns);
+
+ // Helpers
+ THTensor *gradInput_n = THTensor_(new)();
+ THTensor *gradOutput_n = THTensor_(new)();
+
+ // For each elt in batch, do:
+  for (int elt = 0; elt < batchSize; elt++) {
+    // Matrix multiply per sample:
+ THTensor_(select)(gradInput_n, gradInput, 0, elt);
+ THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
+
+ // M,N,K are dims of matrix A and B
+ long m = nInputPlane*kW*kH;
+ long n = gradColumns->size[1];
+ long k = nOutputPlane;
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
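+    // Row-major view: gradColumns = weight^T * gradOutput_n.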
+ THBlas_(gemm)(
+ 'n', 't',
+ n, m, k,
+ 1,
+ THTensor_(data)(gradOutput_n), n,
+ THTensor_(data)(weight), m,
+ 0,
+ THTensor_(data)(gradColumns), n
+ );
+
+ // Unpack columns back into input:
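+    // col2im is the adjoint of im2col: entries from overlapping windows are
+    // accumulated back into their source pixels of gradInput_n.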
+ THNN_(col2im)(
+ THTensor_(data)(gradColumns),
+ nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
+ dilationH, dilationW,
+ THTensor_(data)(gradInput_n)
+ );
+ }
+
+ // Free
+ THTensor_(free)(gradInput_n);
+ THTensor_(free)(gradOutput_n);
+
+ // Resize output
+ if (batch == 0) {
+ THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth);
+ THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
+ THTensor_(resize3d)(gradInput, nInputPlane, inputHeight, inputWidth);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(weight);
+}
+
+
+void THNN_(SpatialDilatedConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *columns,
+ THTensor *ones,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int dilationW, int dilationH,
+ accreal scale_)
+{
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ THNN_(SpatialDilatedConvolution_shapeCheck)
+ (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW,
+ dilationH, dilationW);
+
+ // Params
+ int nInputPlane = gradWeight->size[1];
+ int nOutputPlane = gradWeight->size[0];
+
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous");
+ if (gradBias)
+ THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous");
+ int batch = 1;
+ if (input->nDimension == 3) {
+ // Force batch
+ batch = 0;
+ THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
+ THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0],
+ gradOutput->size[1], gradOutput->size[2]);
+ }
+
+ long inputWidth = input->size[3];
+ long inputHeight = input->size[2];
+ long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+ long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Define a buffer of ones, for bias accumulation
+ if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
+ // Resize plane and fill with ones...
+ THTensor_(resize2d)(ones, outputHeight, outputWidth);
+ THTensor_(fill)(ones, 1);
+ }
+
+ // Resize temporary columns
+ THTensor_(resize2d)(columns, nInputPlane*kW*kH, outputHeight*outputWidth);
+
+ // Helpers
+ THTensor *input_n = THTensor_(new)();
+ THTensor *gradOutput_n = THTensor_(new)();
+
+ // For each elt in batch, do:
+  for (int elt = 0; elt < batchSize; elt++) {
+    // Matrix multiply per output:
+ THTensor_(select)(input_n, input, 0, elt);
+ THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
+
+ // Extract columns:
+ THNN_(im2col)(
+ THTensor_(data)(input_n),
+ nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
+ dilationH, dilationW,
+ THTensor_(data)(columns)
+ );
+
+ // M,N,K are dims of matrix A and B
+ long m = nOutputPlane;
+ long n = nInputPlane*kW*kH;
+ long k = columns->size[1];
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
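+    // Row-major view: gradWeight += scale * gradOutput_n * columns^T, an
+    // outer-product sum over all output positions of this sample.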
+ THBlas_(gemm)(
+ 't', 'n',
+ n, m, k,
+ scale,
+ THTensor_(data)(columns), k,
+ THTensor_(data)(gradOutput_n), k,
+ 1,
+ THTensor_(data)(gradWeight), n
+ );
+
+ // Do Bias:
+ // M,N,K are dims of matrix A and B
+ long m_ = nOutputPlane;
+ long k_ = outputHeight * outputWidth;
+
+ // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
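+    // Multiplying by the ones vector reduces each plane of gradOutput_n to
+    // its spatial sum: gradBias[p] += scale * sum_{h,w} gradOutput_n[p][h][w].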
+ if (gradBias) {
+ THBlas_(gemv)(
+ 't',
+ k_, m_,
+ scale,
+ THTensor_(data)(gradOutput_n), k_,
+ THTensor_(data)(ones), 1,
+ 1,
+ THTensor_(data)(gradBias), 1
+ );
+ }
+ }
+
+ // Free
+ THTensor_(free)(input_n);
+ THTensor_(free)(gradOutput_n);
+
+ // Resize
+ if (batch == 0) {
+ THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth);
+ THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialDilatedMaxPooling.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialDilatedMaxPooling.c
new file mode 100644
index 000000000..8f4ad13c3
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialDilatedMaxPooling.c
@@ -0,0 +1,401 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialDilatedMaxPooling.c"
+#else
+
+static inline void THNN_(SpatialDilatedMaxPooling_shapeCheck)(
+ THTensor *input, THTensor *gradOutput, THIndexTensor *indices,
+ int kH, int kW, int dH, int dW, int padH, int padW,
+ int dilationH, int dilationW, bool ceil_mode) {
+
+ THArgCheck(kW > 0 && kH > 0, 5,
+ "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
+ THArgCheck(dW > 0 && dH > 0, 8,
+ "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
+ THArgCheck(dilationH > 0 && dilationW > 0, 12,
+ "dilation should be greater than zero, but got dilationH: %d dilationW: %d",
+ dilationH, dilationW);
+
+ int ndim = input->nDimension;
+ int dimf = 0;
+ int dimh = 1;
+ int dimw = 2;
+
+ if (ndim == 4) {
+ dimf++;
+ dimh++;
+ dimw++;
+ }
+
+ THNN_ARGCHECK(ndim == 3 || ndim == 4, 2, input,
+ "3D or 4D input tensor expected but got: %s");
+
+ THArgCheck(kW/2 >= padW && kH/2 >= padH, 2,
+ "pad should be smaller than half of kernel size, but got "
+ "padW = %d, padH = %d, kW = %d, kH = %d",
+ padW, padH, kW, kH);
+
+ long nInputPlane = input->size[dimh-1];
+ long inputHeight = input->size[dimh];
+ long inputWidth = input->size[dimw];
+ long outputHeight, outputWidth;
+ long nOutputPlane = nInputPlane;
+
+ if (ceil_mode)
+ {
+ outputHeight = (long)(ceil((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1;
+ outputWidth = (long)(ceil((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1;
+ }
+ else
+ {
+ outputHeight = (long)(floor((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1;
+ outputWidth = (long)(floor((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1;
+ }
+
+ if (padW || padH)
+ {
+ // ensure that the last pooling starts inside the image
+ // needed to avoid problems in ceil mode
+ if ((outputHeight - 1)*dH >= inputHeight + padH)
+ --outputHeight;
+ if ((outputWidth - 1)*dW >= inputWidth + padW)
+ --outputWidth;
+ }
+
+ if (outputWidth < 1 || outputHeight < 1)
+ THError("Given input size: (%dx%dx%d). "
+ "Calculated output size: (%dx%dx%d). Output size is too small",
+ nInputPlane,inputHeight,inputWidth,nInputPlane,outputHeight,outputWidth);
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
+ }
+ if (indices != NULL) {
+ THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimf, nOutputPlane);
+ THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimh, outputHeight);
+ THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimw, outputWidth);
+ }
+}
+
+static void THNN_(SpatialDilatedMaxPooling_updateOutput_frame)(
+ real *input_p,
+ real *output_p,
+ THIndex_t *ind_p,
+ long nslices,
+ long iwidth,
+ long iheight,
+ long owidth,
+ long oheight,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH,
+ int dilationW,
+ int dilationH
+ )
+{
+ long k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nslices; k++)
+ {
+ /* loop over output */
+ long i, j;
+ real *ip = input_p + k*iwidth*iheight;
+ for(i = 0; i < oheight; i++)
+ {
+ for(j = 0; j < owidth; j++)
+ {
+ long hstart = i * dH - padH;
+ long wstart = j * dW - padW;
+ long hend = fminf(hstart + (kH - 1) * dilationH + 1, iheight);
+ long wend = fminf(wstart + (kW - 1) * dilationW + 1, iwidth);
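+        // Advance negative window starts by whole dilation steps so the
+        // sampled positions stay on the dilation grid once inside the image.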
+ while(hstart < 0)
+ hstart += dilationH;
+ while(wstart < 0)
+ wstart += dilationW;
+
+ /* local pointers */
+ real *op = output_p + k*owidth*oheight + i*owidth + j;
+ THIndex_t *indp = ind_p + k*owidth*oheight + i*owidth + j;
+
+ /* compute local max: */
+ long maxindex = -1;
+ real maxval = -THInf;
+ long tcntr = 0;
+ long x,y;
+ for(y = hstart; y < hend; y += dilationH)
+ {
+ for(x = wstart; x < wend; x += dilationW)
+ {
+ tcntr = y*iwidth + x;
+ real val = *(ip + tcntr);
+ if (val > maxval)
+ {
+ maxval = val;
+ maxindex = tcntr;
+ }
+ }
+ }
+
+ /* set output to local max */
+ *op = maxval;
+
+ /* store location of max */
+ *indp = maxindex + TH_INDEX_BASE;
+ }
+ }
+ }
+}
+
+void THNN_(SpatialDilatedMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH,
+ int dilationW,
+ int dilationH,
+ bool ceil_mode)
+{
+
+ int dimw = 2;
+ int dimh = 1;
+ long nbatch = 1;
+ long nInputPlane;
+ long inputHeight;
+ long inputWidth;
+ long outputHeight;
+ long outputWidth;
+ real *input_data;
+ real *output_data;
+ THIndex_t *indices_data;
+
+ THNN_(SpatialDilatedMaxPooling_shapeCheck)
+ (input, NULL, NULL, kH, kW, dH, dW,
+ padH, padW, dilationH, dilationW, ceil_mode);
+
+ if (input->nDimension == 4)
+ {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ }
+
+ /* sizes */
+ nInputPlane = input->size[dimh-1];
+ inputHeight = input->size[dimh];
+ inputWidth = input->size[dimw];
+ if (ceil_mode)
+ {
+ outputHeight = (long)(ceil((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1;
+ outputWidth = (long)(ceil((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1;
+ }
+ else
+ {
+ outputHeight = (long)(floor((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1;
+ outputWidth = (long)(floor((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1;
+ }
+
+ if (padW || padH)
+ {
+ // ensure that the last pooling starts inside the image
+ // needed to avoid problems in ceil mode
+ if ((outputHeight - 1)*dH >= inputHeight + padH)
+ --outputHeight;
+ if ((outputWidth - 1)*dW >= inputWidth + padW)
+ --outputWidth;
+ }
+
+ /* get contiguous input */
+ input = THTensor_(newContiguous)(input);
+
+ /* resize output */
+ if (input->nDimension == 3)
+ {
+ THTensor_(resize3d)(output, nInputPlane, outputHeight, outputWidth);
+ /* indices will contain the locations for each output point */
+ THIndexTensor_(resize3d)(indices, nInputPlane, outputHeight, outputWidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+ indices_data = THIndexTensor_(data)(indices);
+
+ THNN_(SpatialDilatedMaxPooling_updateOutput_frame)
+ (input_data, output_data,
+ indices_data,
+ nInputPlane,
+ inputWidth, inputHeight,
+ outputWidth, outputHeight,
+ kW, kH, dW, dH,
+ padW, padH,
+ dilationW, dilationH
+ );
+ }
+ else
+ {
+ long p;
+
+ THTensor_(resize4d)(output, nbatch, nInputPlane, outputHeight, outputWidth);
+ /* indices will contain the locations for each output point */
+ THIndexTensor_(resize4d)(indices, nbatch, nInputPlane, outputHeight, outputWidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+ indices_data = THIndexTensor_(data)(indices);
+
+#pragma omp parallel for private(p)
+ for (p = 0; p < nbatch; p++)
+ {
+ THNN_(SpatialDilatedMaxPooling_updateOutput_frame)
+ (input_data+p*nInputPlane*inputWidth*inputHeight,
+ output_data+p*nInputPlane*outputWidth*outputHeight,
+ indices_data+p*nInputPlane*outputWidth*outputHeight,
+ nInputPlane,
+ inputWidth, inputHeight,
+ outputWidth, outputHeight,
+ kW, kH, dW, dH,
+ padW, padH,
+ dilationW, dilationH
+ );
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(input);
+}
+
+static void THNN_(SpatialDilatedMaxPooling_updateGradInput_frame)(
+ real *gradInput_p,
+ real *gradOutput_p,
+ THIndex_t *ind_p,
+ long nInputPlane,
+ long inputWidth,
+ long inputHeight,
+ long outputWidth,
+ long outputHeight,
+ int dW,
+ int dH)
+{
+ long k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nInputPlane; k++)
+ {
+ real *gradInput_p_k = gradInput_p + k*inputWidth*inputHeight;
+ real *gradOutput_p_k = gradOutput_p + k*outputWidth*outputHeight;
+ THIndex_t *ind_p_k = ind_p + k*outputWidth*outputHeight;
+
+ /* calculate max points */
+ long i, j;
+ for(i = 0; i < outputHeight; i++)
+ {
+ for(j = 0; j < outputWidth; j++)
+ {
+ /* retrieve position of max */
+ long maxp = ind_p_k[i*outputWidth + j] - TH_INDEX_BASE;
+ if (maxp != -1) {
+ /* update gradient */
+ gradInput_p_k[maxp] += gradOutput_p_k[i*outputWidth + j];
+ }
+ }
+ }
+ }
+}
+
+void THNN_(SpatialDilatedMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH,
+ int dilationW,
+ int dilationH,
+ bool ceil_mode)
+{
+ int dimw = 2;
+ int dimh = 1;
+ long nbatch = 1;
+ int nInputPlane;
+ int inputHeight;
+ int inputWidth;
+ int outputHeight;
+ int outputWidth;
+ real *gradInput_data;
+ real *gradOutput_data;
+ THIndex_t *indices_data;
+
+ THNN_(SpatialDilatedMaxPooling_shapeCheck)
+ (input, gradOutput, indices, kH, kW, dH, dW,
+ padH, padW, dilationH, dilationW, ceil_mode);
+
+ /* get contiguous gradOutput */
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ /* resize */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ if (input->nDimension == 4) {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ }
+
+ /* sizes */
+ nInputPlane = input->size[dimh-1];
+ inputHeight = input->size[dimh];
+ inputWidth = input->size[dimw];
+ outputHeight = gradOutput->size[dimh];
+ outputWidth = gradOutput->size[dimw];
+
+ /* get raw pointers */
+ gradInput_data = THTensor_(data)(gradInput);
+ gradOutput_data = THTensor_(data)(gradOutput);
+ indices_data = THIndexTensor_(data)(indices);
+
+ /* backprop */
+ if (input->nDimension == 3)
+ {
+ THNN_(SpatialDilatedMaxPooling_updateGradInput_frame)
+ (gradInput_data, gradOutput_data,
+ indices_data,
+ nInputPlane,
+ inputWidth, inputHeight,
+ outputWidth, outputHeight,
+ dW, dH);
+ }
+ else
+ {
+ long p;
+#pragma omp parallel for private(p)
+ for (p = 0; p < nbatch; p++)
+ {
+ THNN_(SpatialDilatedMaxPooling_updateGradInput_frame)
+ (gradInput_data+p*nInputPlane*inputWidth*inputHeight,
+ gradOutput_data+p*nInputPlane*outputWidth*outputHeight,
+ indices_data+p*nInputPlane*outputWidth*outputHeight,
+ nInputPlane,
+ inputWidth, inputHeight,
+ outputWidth, outputHeight,
+ dW, dH);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialFractionalMaxPooling.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialFractionalMaxPooling.c
new file mode 100644
index 000000000..a98954cc6
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialFractionalMaxPooling.c
@@ -0,0 +1,253 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialFractionalMaxPooling.c"
+#else
+
+static long* THNN_(SpatialFractionalMaxPooling_generateIntervals)(
+ real sample,
+ long inputSize,
+ long outputSize,
+ int poolSize) {
+ real alpha = (real) (inputSize - poolSize) / (real) (outputSize - 1);
+ long* sequence = (long*) THAlloc(sizeof(long) * outputSize);
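+  /* sequence[i] = floor((i + sample) * alpha) - floor(sample * alpha), so
+     consecutive starts differ by floor(alpha) or ceil(alpha); the random
+     sample in [0, 1) only shifts where the long and short steps occur. */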
+
+ long i;
+ for (i = 0; i < outputSize - 1; ++i) {
+ sequence[i] =
+ (long) ((i + sample) * alpha) - (long) (sample * alpha);
+ }
+ sequence[outputSize - 1] = inputSize - poolSize;
+
+ return sequence;
+}
+
+static void THNN_(SpatialFractionalMaxPooling_updateOutput_frame)(
+ real* input,
+ real* output,
+ THIndex_t* indices,
+ real* randomSamples,
+ long numPlanes,
+ long inputW, long inputH,
+ long outputW, long outputH,
+ int poolSizeW, int poolSizeH) {
+ long plane;
+#pragma omp parallel for private(plane)
+ for (plane = 0; plane < numPlanes; ++plane) {
+ /* each plane contains 2 random samples, one for W and one for H */
+ real* randomSamplesForPlane = randomSamples + plane * 2;
+
+ /* Generate interval sequence */
+ long* sequenceW =
+ THNN_(SpatialFractionalMaxPooling_generateIntervals)(
+ randomSamplesForPlane[0], inputW, outputW, poolSizeW);
+ long* sequenceH =
+ THNN_(SpatialFractionalMaxPooling_generateIntervals)(
+ randomSamplesForPlane[1], inputH, outputH, poolSizeH);
+
+ /* loop over output */
+ long h, w;
+
+ real* inputForPlane = input + plane * inputW * inputH;
+ real* outputForPlane = output + plane * outputW * outputH;
+ THIndex_t* indicesForPlane = indices + plane * outputW * outputH;
+
+ for (h = 0; h < outputH; ++h) {
+ long inputHStart = sequenceH[h];
+
+ for (w = 0; w < outputW; ++w) {
+ long inputWStart = sequenceW[w];
+
+ real maxVal = -THInf;
+ long maxIndex = -1;
+
+ long h2, w2;
+ for (h2 = inputHStart; h2 < inputHStart + poolSizeH; ++h2) {
+ for (w2 = inputWStart; w2 < inputWStart + poolSizeW; ++w2) {
+ THAssert(h2 >= 0 && h2 < inputH);
+ THAssert(w2 >= 0 && w2 < inputW);
+
+ long planeIndex = h2 * inputW + w2;
+ real val = inputForPlane[planeIndex];
+ if (val > maxVal) {
+ maxVal = val;
+ maxIndex = planeIndex;
+ }
+ }
+ }
+
+ THAssert(maxVal != -THInf);
+ THAssert(maxIndex != -1);
+
+ outputForPlane[h * outputW + w] = maxVal;
+ /* +1 to lua index */
+ indicesForPlane[h * outputW + w] = maxIndex + TH_INDEX_BASE;
+ }
+ }
+
+ THFree(sequenceW);
+ THFree(sequenceH);
+ }
+}
+
+void THNN_(SpatialFractionalMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int outputW, int outputH,
+ int poolSizeW, int poolSizeH,
+ THIndexTensor *indices,
+ THTensor *randomSamples) {
+
+ long numBatch = 1;
+ int planeDim = 0;
+ int heightDim = 1;
+ int widthDim = 2;
+
+ long numInputDims = THTensor_(nDimension)(input);
+ THNN_ARGCHECK(numInputDims == 3 || numInputDims == 4, 2, input,
+ "3D or 4D (batch mode) tensor expected for input, but got: %s");
+
+ if (numInputDims == 4) {
+ numBatch = THTensor_(size)(input, 0);
+ planeDim++;
+ heightDim++;
+ widthDim++;
+ }
+
+ /* sizes */
+ long numPlanes = THTensor_(size)(input, planeDim);
+ long inputH = THTensor_(size)(input, heightDim);
+ long inputW = THTensor_(size)(input, widthDim);
+
+  THArgCheck(outputH + poolSizeH - 1 < inputH, 7,
+             "poolSizeH (%d) too large relative to input height (%ld)",
+             poolSizeH, inputH);
+  THArgCheck(outputW + poolSizeW - 1 < inputW, 6,
+             "poolSizeW (%d) too large relative to input width (%ld)",
+             poolSizeW, inputW);
+
+ /* get contiguous input */
+ input = THTensor_(newContiguous)(input);
+
+ if (numInputDims == 3) {
+ /* resize output */
+ THTensor_(resize3d)(output, numPlanes, outputH, outputW);
+ /* indices will contain the locations for each output point */
+ THIndexTensor_(resize3d)(indices, numPlanes, outputH, outputW);
+
+ THNN_(SpatialFractionalMaxPooling_updateOutput_frame)(
+ THTensor_(data)(input),
+ THTensor_(data)(output),
+ THIndexTensor_(data)(indices),
+ THTensor_(data)(randomSamples),
+ numPlanes, inputW, inputH, outputW, outputH, poolSizeW, poolSizeH);
+ } else {
+ THTensor_(resize4d)(output, numBatch, numPlanes, outputH, outputW);
+ /* indices will contain the locations for each output point */
+ THIndexTensor_(resize4d)(indices, numBatch, numPlanes, outputH, outputW);
+
+ long batch;
+#pragma omp parallel for private(batch)
+ for (batch = 0; batch < numBatch; ++batch) {
+ THNN_(SpatialFractionalMaxPooling_updateOutput_frame)(
+ THTensor_(data)(input) + batch * numPlanes * inputH * inputW,
+ THTensor_(data)(output) + batch * numPlanes * outputH * outputW,
+ THIndexTensor_(data)(indices) + batch * numPlanes * outputH * outputW,
+ THTensor_(data)(randomSamples) + batch * numPlanes * 2,
+ numPlanes, inputW, inputH, outputW, outputH, poolSizeW, poolSizeH);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(input);
+}
+
+static void THNN_(SpatialFractionalMaxPooling_updateGradInput_frame)(
+ real* gradInput,
+ real* gradOutput,
+ THIndex_t* indices,
+ long numPlanes,
+ long inputW, long inputH,
+ long outputW, long outputH) {
+ long plane;
+#pragma omp parallel for private(plane)
+ for (plane = 0; plane < numPlanes; plane++) {
+ real* gradInputForPlane = gradInput + plane * inputW * inputH;
+ real* gradOutputForPlane = gradOutput + plane * outputW * outputH;
+ THIndex_t* indicesForPlane = indices + plane * outputW * outputH;
+
+ long h, w;
+ for (h = 0; h < outputH; ++h) {
+ for (w = 0; w < outputW; ++w) {
+ long outputIndex = h * outputW + w;
+ long index = indicesForPlane[outputIndex] - TH_INDEX_BASE;
+ THAssert(index >= 0 && index < inputW * inputH);
+
+ gradInputForPlane[index] += gradOutputForPlane[outputIndex];
+ }
+ }
+ }
+}
+
+void THNN_(SpatialFractionalMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int outputW, int outputH,
+ int poolSizeW, int poolSizeH,
+ THIndexTensor *indices) {
+
+ long numBatch = 1;
+ int planeDim = 0;
+ int heightDim = 1;
+ int widthDim = 2;
+
+ long numInputDims = THTensor_(nDimension)(input);
+ if (numInputDims == 4) {
+ numBatch = THTensor_(size)(input, 0);
+ planeDim = 1;
+ heightDim++;
+ widthDim++;
+ }
+
+ /* sizes */
+ long numPlanes = THTensor_(size)(input, planeDim);
+ long inputH = THTensor_(size)(input, heightDim);
+ long inputW = THTensor_(size)(input, widthDim);
+
+ THArgCheck(outputW == THTensor_(size)(gradOutput, widthDim), 3,
+ "gradOutput width unexpected");
+ THArgCheck(outputH == THTensor_(size)(gradOutput, heightDim), 3,
+ "gradOutput height unexpected");
+
+ /* get contiguous gradOutput */
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ /* resize */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ /* backprop */
+ if (numInputDims == 3) {
+ THNN_(SpatialFractionalMaxPooling_updateGradInput_frame)(
+ THTensor_(data)(gradInput),
+ THTensor_(data)(gradOutput),
+ THIndexTensor_(data)(indices),
+ numPlanes, inputW, inputH, outputW, outputH);
+ } else {
+ long batch;
+#pragma omp parallel for private(batch)
+ for (batch = 0; batch < numBatch; ++batch) {
+ THNN_(SpatialFractionalMaxPooling_updateGradInput_frame)(
+ THTensor_(data)(gradInput) + batch * numPlanes * inputH * inputW,
+ THTensor_(data)(gradOutput) + batch * numPlanes * outputH * outputW,
+ THIndexTensor_(data)(indices) + batch * numPlanes * outputH * outputW,
+ numPlanes, inputW, inputH, outputW, outputH);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialFullConvolution.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialFullConvolution.c
new file mode 100644
index 000000000..2edc53b5a
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialFullConvolution.c
@@ -0,0 +1,462 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialFullConvolution.c"
+#else
+
+static void THNN_(im2col)(const real* data_im, const int channels,
+ const int height, const int width, const int kernel_h, const int kernel_w,
+ const int pad_h, const int pad_w,
+ const int stride_h, const int stride_w,
+ const int dilation_h, const int dilation_w,
+ real* data_col) {
+ const int height_col = (height + 2 * pad_h -
+ (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+ const int width_col = (width + 2 * pad_w -
+ (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
+ const int channels_col = channels * kernel_h * kernel_w;
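+  // Row c_col of data_col holds the input value that channel c_im contributes
+  // at kernel offset (h_offset, w_offset) for every output location, with
+  // zeros where the window falls into the padding.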
+ for (int c_col = 0; c_col < channels_col; ++c_col) {
+ int w_offset = c_col % kernel_w;
+ int h_offset = (c_col / kernel_w) % kernel_h;
+ int c_im = c_col / kernel_h / kernel_w;
+ for (int h_col = 0; h_col < height_col; ++h_col) {
+ for (int w_col = 0; w_col < width_col; ++w_col) {
+ int h_im = h_col * stride_h - pad_h + h_offset * dilation_h;
+ int w_im = w_col * stride_w - pad_w + w_offset * dilation_w;
+ data_col[(c_col * height_col + h_col) * width_col + w_col] =
+ (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
+ data_im[(c_im * height + h_im) * width + w_im] : 0;
+ }
+ }
+ }
+}
+
+static void THNN_(col2im)(const real* data_col, const int channels,
+ const int height, const int width, const int kernel_h, const int kernel_w,
+ const int pad_h, const int pad_w,
+ const int stride_h, const int stride_w,
+ const int dilation_h, const int dilation_w,
+ real* data_im) {
+ memset(data_im, 0, sizeof(real) * height * width * channels);
+ const int height_col = (height + 2 * pad_h -
+ (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+ const int width_col = (width + 2 * pad_w -
+ (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
+ const int channels_col = channels * kernel_h * kernel_w;
+ for (int c_col = 0; c_col < channels_col; ++c_col) {
+ int w_offset = c_col % kernel_w;
+ int h_offset = (c_col / kernel_w) % kernel_h;
+ int c_im = c_col / kernel_h / kernel_w;
+ for (int h_col = 0; h_col < height_col; ++h_col) {
+ for (int w_col = 0; w_col < width_col; ++w_col) {
+ int h_im = h_col * stride_h - pad_h + h_offset * dilation_h;
+ int w_im = w_col * stride_w - pad_w + w_offset * dilation_w;
+ if (h_im >= 0 && h_im < height && w_im >= 0 && w_im < width)
+ data_im[(c_im * height + h_im) * width + w_im] +=
+ data_col[(c_col * height_col + h_col) * width_col + w_col];
+ }
+ }
+ }
+}
+
+static inline void THNN_(SpatialFullConvolution_shapeCheck)(
+ THTensor *input, THTensor *gradOutput,
+ THTensor *weight, THTensor *bias,
+ int kH, int kW, int dH, int dW, int padH, int padW, int adjH, int adjW) {
+
+ THArgCheck(kW > 0 && kH > 0, 9,
+ "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
+ THArgCheck(dW > 0 && dH > 0, 11,
+ "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
+ THArgCheck(adjW < dW && adjH < dH, 15,
+ "output adjustment must be smaller than stride, but got adjH: %d adjW: %d dH: %d dW: %d",
+ adjH, adjW, dH, dW);
+ THNN_ARGCHECK(weight->nDimension == 2 || weight->nDimension == 4, 5, weight,
+ "2D or 4D weight tensor expected, but got: %s");
+
+ if (bias != NULL) {
+ THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[1]);
+ }
+
+ int ndim = input->nDimension;
+ int dimf = 0;
+ int dimh = 1;
+ int dimw = 2;
+
+ if (ndim == 4) {
+ dimf++;
+ dimh++;
+ dimw++;
+ }
+
+ THNN_ARGCHECK(ndim == 3 || ndim == 4, 2, input,
+ "3D or 4D input tensor expected but got: %s");
+
+ long nInputPlane = weight->size[0];
+ long inputHeight = input->size[dimh];
+ long inputWidth = input->size[dimw];
+ long nOutputPlane = weight->size[1];
+ long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
+ long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW;
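+  // This inverts the convolution size formula o = (i + 2p - k)/d + 1, e.g.
+  // inputHeight = 4, dH = 2, padH = 1, kH = 3, adjH = 0 gives
+  // (4-1)*2 - 2 + 3 = 7; adjH/adjW (checked above to be smaller than the
+  // stride) disambiguate among the sizes a strided convolution would
+  // collapse to the same input size.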
+
+ if (outputWidth < 1 || outputHeight < 1)
+ THError("Given input size: (%d x %d x %d). "
+ "Calculated output size: (%d x %d x %d). Output size is too small",
+ nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth);
+
+ THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane);
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
+ }
+}
+
+void THNN_(SpatialFullConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *columns,
+ THTensor *ones,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int adjW, int adjH)
+{
+ THNN_(SpatialFullConvolution_shapeCheck)
+ (input, NULL, weight, bias, kH, kW, dH, dW, padH, padW, adjH, adjW);
+
+ int nInputPlane = THTensor_(size)(weight,0);
+ int nOutputPlane = THTensor_(size)(weight,1);
+
+ input = THTensor_(newContiguous)(input);
+ weight = THTensor_(newContiguous)(weight);
+ bias = bias ? THTensor_(newContiguous)(bias) : bias;
+ int batch = 1;
+ if (input->nDimension == 3) {
+ // Force batch
+ batch = 0;
+ THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
+ }
+
+ long inputHeight = input->size[2];
+ long inputWidth = input->size[3];
+ long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
+ long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW;
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Resize output
+ THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth);
+
+ // Resize temporary columns
+ THTensor_(resize2d)(columns, nOutputPlane*kW*kH, inputHeight*inputWidth);
+ THTensor_(zero)(columns);
+
+ // Define a buffer of ones, for bias accumulation
+  // Note: this buffer can be shared with other modules; it only ever grows
+  // and always contains ones.
+ if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
+ // Resize plane and fill with ones...
+ THTensor_(resize2d)(ones, outputHeight, outputWidth);
+ THTensor_(fill)(ones, 1);
+ }
+
+ // Helpers
+ THTensor *input_n = THTensor_(new)();
+ THTensor *output_n = THTensor_(new)();
+
+ int elt;
+ // For each elt in batch, do:
+  for (elt = 0; elt < batchSize; elt++) {
+    // Matrix multiply per output:
+ THTensor_(select)(input_n, input, 0, elt);
+ THTensor_(select)(output_n, output, 0, elt);
+
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long m = weight->size[1] * weight->size[2] * weight->size[3];
+ long n = columns->size[1];
+ long k = weight->size[0];
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
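+    // Row-major view: columns = weight^T * input_n; col2im below then
+    // scatter-adds these columns into the (larger) output image, the exact
+    // adjoint of the im2col + GEMM convolution forward.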
+ THBlas_(gemm)(
+ 'n', 't',
+ n, m, k,
+ 1,
+ THTensor_(data)(input_n), n,
+ THTensor_(data)(weight), m,
+ 0,
+ THTensor_(data)(columns), n
+ );
+
+    // Unpack columns back into the output:
+ THNN_(col2im)(
+ THTensor_(data)(columns),
+ nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
+ 1, 1,
+ THTensor_(data)(output_n)
+ );
+
+ // Do Bias after:
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long m_ = nOutputPlane;
+ long n_ = outputHeight * outputWidth;
+ long k_ = 1;
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ if (bias) {
+ THBlas_(gemm)(
+ 't', 'n',
+ n_, m_, k_,
+ 1,
+ THTensor_(data)(ones), k_,
+ THTensor_(data)(bias), k_,
+ 1,
+ THTensor_(data)(output_n), n_
+ );
+ }
+ }
+
+ // Free
+ THTensor_(free)(input_n);
+ THTensor_(free)(output_n);
+
+ // Resize output
+ if (batch == 0) {
+ THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth);
+ THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(weight);
+ if (bias) THTensor_(free)(bias);
+}
+
+void THNN_(SpatialFullConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *gradColumns,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int adjW, int adjH)
+{
+ THNN_(SpatialFullConvolution_shapeCheck)
+ (input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW, adjH, adjW);
+
+ int nInputPlane = THTensor_(size)(weight,0);
+ int nOutputPlane = THTensor_(size)(weight,1);
+
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ weight = THTensor_(newContiguous)(weight);
+ int batch = 1;
+ if (input->nDimension == 3) {
+ // Force batch
+ batch = 0;
+ THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
+ THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
+ }
+
+ long inputWidth = input->size[3];
+ long inputHeight = input->size[2];
+ long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW;
+ long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Resize output
+ THTensor_(resize4d)(gradInput, batchSize, nInputPlane, inputHeight, inputWidth);
+ THTensor_(zero)(gradInput);
+
+ // Resize temporary columns
+ THTensor_(resize2d)(gradColumns, nOutputPlane*kW*kH, inputHeight*inputWidth);
+
+ // Helpers
+ THTensor *gradInput_n = THTensor_(new)();
+ THTensor *gradOutput_n = THTensor_(new)();
+
+ int elt;
+ // For each elt in batch, do:
+  for (elt = 0; elt < batchSize; elt++) {
+    // Matrix multiply per sample:
+ THTensor_(select)(gradInput_n, gradInput, 0, elt);
+ THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
+
+ // Extract columns:
+ THNN_(im2col)(
+ THTensor_(data)(gradOutput_n),
+ nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
+ 1, 1,
+ THTensor_(data)(gradColumns)
+ );
+
+
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long m = weight->size[0];
+ long n = gradColumns->size[1];
+ long k = weight->size[1] * weight->size[2] * weight->size[3];
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ THBlas_(gemm)(
+ 'n', 'n',
+ n, m, k,
+ 1,
+ THTensor_(data)(gradColumns), n,
+ THTensor_(data)(weight), k,
+ 0,
+ THTensor_(data)(gradInput_n), n
+ );
+ }
+
+
+ // Free
+ THTensor_(free)(gradInput_n);
+ THTensor_(free)(gradOutput_n);
+
+ // Resize output
+ if (batch == 0) {
+ THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth);
+ THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
+ THTensor_(resize3d)(gradInput, nInputPlane, inputHeight, inputWidth);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(weight);
+}
+
+
+void THNN_(SpatialFullConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *columns,
+ THTensor *ones,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int adjW, int adjH,
+ accreal scale_)
+{
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ THNN_(SpatialFullConvolution_shapeCheck)
+ (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW, adjH, adjW);
+
+ int nInputPlane = THTensor_(size)(gradWeight,0);
+ int nOutputPlane = THTensor_(size)(gradWeight,1);
+
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous");
+ if (gradBias)
+ THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous");
+ int batch = 1;
+ if (input->nDimension == 3) {
+ // Force batch
+ batch = 0;
+ THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
+ THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
+ }
+
+ long inputWidth = input->size[3];
+ long inputHeight = input->size[2];
+ long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW;
+ long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Define a buffer of ones, for bias accumulation
+ if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
+ // Resize plane and fill with ones...
+ THTensor_(resize2d)(ones, outputHeight, outputWidth);
+ THTensor_(fill)(ones, 1);
+ }
+
+ // Resize temporary columns
+ THTensor_(resize2d)(columns, nOutputPlane*kW*kH, inputHeight*inputWidth);
+
+ // Helpers
+ THTensor *input_n = THTensor_(new)();
+ THTensor *gradOutput_n = THTensor_(new)();
+
+ int elt;
+ // For each elt in batch, do:
+  for (elt = 0; elt < batchSize; elt++) {
+    // Matrix multiply per output:
+ THTensor_(select)(input_n, input, 0, elt);
+ THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
+
+ // Extract columns:
+ THNN_(im2col)(
+ THTensor_(data)(gradOutput_n),
+ nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
+ 1, 1,
+ THTensor_(data)(columns)
+ );
+
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long n = columns->size[0]; // nOutputPlane * kh * kw
+ long m = input_n->size[0]; // nInputPlane
+ long k = columns->size[1]; // inputHeight * inputWidth
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ THBlas_(gemm)(
+ 't', 'n',
+ n, m, k,
+ scale,
+ THTensor_(data)(columns), k,
+ THTensor_(data)(input_n), k,
+ 1,
+ THTensor_(data)(gradWeight), n
+ );
+
+
+ // Do Bias:
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long m_ = nOutputPlane;
+ long k_ = outputHeight * outputWidth;
+
+ // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
+ if (gradBias) {
+ THBlas_(gemv)(
+ 't',
+ k_, m_,
+ scale,
+ THTensor_(data)(gradOutput_n), k_,
+ THTensor_(data)(ones), 1,
+ 1,
+ THTensor_(data)(gradBias), 1
+ );
+ }
+ }
+
+ // Free
+ THTensor_(free)(input_n);
+ THTensor_(free)(gradOutput_n);
+
+ // Resize
+ if (batch == 0) {
+ THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth);
+ THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialFullConvolutionMap.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialFullConvolutionMap.c
new file mode 100644
index 000000000..6952fbe25
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialFullConvolutionMap.c
@@ -0,0 +1,222 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialFullConvolutionMap.c"
+#else
+
+void THNN_(SpatialFullConvolutionMap_updateOutput)(
+ THNNState *state, THTensor *input, THTensor *output_, THTensor *weight, THTensor *bias,
+ THTensor *connTable, int nInputPlane, int nOutputPlane,
+ int dW, int dH)
+{
+ THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous");
+ THArgCheck(!bias || THTensor_(isContiguous)(bias), 5, "bias must be contiguous");
+ THArgCheck(
+ weight != NULL && weight->nDimension == 3
+ && connTable != NULL && connTable->size[0] == weight->size[0], 4,
+ "3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
+ );
+
+ const int kH = (int)weight->size[1];
+ const int kW = (int)weight->size[2];
+
+ THArgCheck(input != NULL && input->nDimension == 3, 2, "3D tensor expected");
+ THArgCheck(input->size[0] >= nInputPlane, 2, "invalid number of input planes");
+
+ THTensor_(resize3d)(
+ output_, nOutputPlane,
+ (input->size[1] - 1) * dH + kH,
+ (input->size[2] - 1) * dW + kW
+ );
+
+ /* contiguous */
+ input = THTensor_(newContiguous)(input);
+ THTensor* output = THTensor_(newContiguous)(output_);
+
+ /* get raw pointers */
+ real *input_data = THTensor_(data)(input);
+ real *output_data = THTensor_(data)(output);
+ real *weight_data = THTensor_(data)(weight);
+ real *bias_data = THTensor_(data)(bias);
+ real *connTable_data = THTensor_(data)(connTable);
+
+ /* and dims */
+ const long input_h = input->size[1];
+ const long input_w = input->size[2];
+ const long output_h = output->size[1];
+ const long output_w = output->size[2];
+ const long weight_h = weight->size[1];
+ const long weight_w = weight->size[2];
+
+ long p;
+#pragma omp parallel for private(p)
+ for (p = 0; p < nOutputPlane; p++)
+ {
+ /* add bias */
+ real *ptr_output = output_data + p*output_w*output_h;
+ long j;
+ int nweight;
+ long k;
+
+ for (j = 0; j < output_h*output_w; j++)
+ ptr_output[j] = bias_data[p];
+
+ /* convolve all maps */
+ nweight = connTable->size[0];
+ for (k = 0; k < nweight; k++)
+ {
+ /* get offsets for input/output */
+ int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE;
+ int i = (int)connTable_data[k*2+0] - TH_INDEX_BASE;
+
+ if (o == p)
+ {
+ THTensor_(fullConv2Dptr)(
+ output_data + o*output_w*output_h,
+ 1.0,
+ input_data + i*input_w*input_h, input_h, input_w,
+ weight_data + k*weight_w*weight_h, weight_h, weight_w,
+ dH, dW
+ );
+ }
+ }
+ }
+
+ /* clean up */
+ THTensor_(free)(input);
+ THTensor_(freeCopyTo)(output, output_);
+}
+
+void THNN_(SpatialFullConvolutionMap_updateGradInput)(
+ THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput_, THTensor *weight, THTensor *bias,
+ THTensor *connTable, int nInputPlane, int nOutputPlane,
+ int dW, int dH)
+{
+ THArgCheck(
+ weight != NULL && weight->nDimension == 3
+ && connTable != NULL && connTable->size[0] == weight->size[0], 5,
+ "3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
+ );
+
+ /* contiguous */
+ THTensor* gradInput = THTensor_(newContiguous)(gradInput_);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ /* Resize/Zero */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ /* get raw pointers */
+ real *gradInput_data = THTensor_(data)(gradInput);
+ real *gradOutput_data = THTensor_(data)(gradOutput);
+ real *weight_data = THTensor_(data)(weight);
+ real *connTable_data = THTensor_(data)(connTable);
+
+ /* and dims */
+ const long input_h = input->size[1];
+ const long input_w = input->size[2];
+ const long output_h = gradOutput->size[1];
+ const long output_w = gradOutput->size[2];
+ const long kH = weight->size[1];
+ const long kW = weight->size[2];
+
+ long p;
+#pragma omp parallel for private(p)
+ for (p = 0; p < nInputPlane; p++)
+ {
+ long k;
+ /* backward all */
+ int nkernel = connTable->size[0];
+ for (k = 0; k < nkernel; k++)
+ {
+ int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE;
+ int i = (int)connTable_data[k*2+0] - TH_INDEX_BASE;
+ if (i == p)
+ {
+ /* gradient to input */
+ THTensor_(validXCorr2Dptr)(
+ gradInput_data + i*input_w*input_h,
+ 1.0,
+ gradOutput_data + o*output_w*output_h, output_h, output_w,
+ weight_data + k*kW*kH, kH, kW,
+ dH, dW
+ );
+ }
+ }
+ }
+
+ /* clean up */
+ THTensor_(freeCopyTo)(gradInput, gradInput_);
+ THTensor_(free)(gradOutput);
+}
+
+void THNN_(SpatialFullConvolutionMap_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *connTable,
+ int nInputPlane,
+ int nOutputPlane,
+ int dW, int dH,
+ accreal scale_)
+{
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ THArgCheck(
+ gradWeight != NULL && gradWeight->nDimension == 3
+ && connTable != NULL && connTable->size[0] == gradWeight->size[0], 5,
+ "3D gradWeight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
+ );
+
+ /* contiguous */
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ /* get raw pointers */
+ real *input_data = THTensor_(data)(input);
+ real *gradOutput_data = THTensor_(data)(gradOutput);
+ real *gradWeight_data = THTensor_(data)(gradWeight);
+ real *gradBias_data = THTensor_(data)(gradBias);
+
+ /* and dims */
+ const long input_h = input->size[1];
+ const long input_w = input->size[2];
+ const long output_h = gradOutput->size[1];
+ const long output_w = gradOutput->size[2];
+ const long weight_h = gradWeight->size[1];
+ const long weight_w = gradWeight->size[2];
+
+ /* gradients wrt bias */
+ long k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nOutputPlane; k++)
+ {
+ real *ptr_gradOutput = gradOutput_data + k*output_w*output_h;
+ long l;
+ for (l = 0; l < output_h*output_w; l++)
+ gradBias_data[k] += scale*ptr_gradOutput[l];
+ }
+
+ /* gradients wrt weight */
+ int nkernel = connTable->size[0];
+#pragma omp parallel for private(k)
+ for (k = 0; k < nkernel; k++)
+ {
+ int o = (int)THTensor_(get2d)(connTable,k,1) - TH_INDEX_BASE;
+ int i = (int)THTensor_(get2d)(connTable,k,0) - TH_INDEX_BASE;
+
+ /* gradient to kernel */
+ THTensor_(validXCorr2DRevptr)(
+ gradWeight_data + k*weight_w*weight_h,
+ scale,
+ gradOutput_data + o*output_w*output_h, output_h, output_w,
+ input_data + i*input_w*input_h, input_h, input_w,
+ dH, dW
+ );
+ }
+
+ /* clean up */
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialMaxPooling.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialMaxPooling.c
new file mode 100644
index 000000000..88aaa40e1
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialMaxPooling.c
@@ -0,0 +1,44 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialMaxPooling.c"
+#else
+
+void THNN_(SpatialMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH,
+ bool ceil_mode)
+{
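+  // Plain max pooling is dilated max pooling with dilationW = dilationH = 1.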
+ THNN_(SpatialDilatedMaxPooling_updateOutput)(
+ state, input, output, indices,
+ kW, kH, dW, dH, padW, padH, 1, 1, ceil_mode
+ );
+}
+
+void THNN_(SpatialMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH,
+ bool ceil_mode)
+{
+ THNN_(SpatialDilatedMaxPooling_updateGradInput)(
+ state, input, gradOutput, gradInput, indices,
+ kW, kH, dW, dH, padW, padH, 1, 1, ceil_mode
+ );
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialMaxUnpooling.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialMaxUnpooling.c
new file mode 100644
index 000000000..320538686
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialMaxUnpooling.c
@@ -0,0 +1,234 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialMaxUnpooling.c"
+#else
+
+static void THNN_(SpatialMaxUnpooling_updateOutput_frame)(real *input_p, real *output_p,
+ THIndex_t *ind_p,
+ int nslices,
+ int iwidth, int iheight,
+ int owidth, int oheight)
+{
+ int k;
+ int has_error = 0;
+ THIndex_t error_index;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nslices; k++)
+ {
+ real *output_p_k = output_p + k*owidth*oheight;
+ real *input_p_k = input_p + k*iwidth*iheight;
+ THIndex_t *ind_p_k = ind_p + k*iwidth*iheight;
+
+ int i, j;
+ THIndex_t maxp;
+ for(i = 0; i < iheight; i++)
+ {
+ for(j = 0; j < iwidth; j++)
+ {
+ maxp = ind_p_k[i*iwidth + j] - TH_INDEX_BASE; /* retrieve position of max */
+ if(maxp<0 || maxp>=owidth*oheight){
+#pragma omp critical
+ {
+ has_error = 1;
+ error_index = maxp;
+ }
+ } else {
+ output_p_k[maxp] = input_p_k[i*iwidth + j]; /* update output */
+ }
+ }
+ }
+ }
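+  /* Raising THError inside the OpenMP parallel region is unsafe, so an
+     invalid index is only recorded (under a critical section) here and
+     reported once the loop has finished. */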
+ if (has_error) {
+ THError("found an invalid max index %ld (output volumes are of size %dx%d)",
+ error_index, oheight, owidth);
+ }
+}
+
+void THNN_(SpatialMaxUnpooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int owidth, int oheight)
+{
+ int dimw = 2;
+ int dimh = 1;
+ int nbatch = 1;
+ int nslices;
+ int iheight;
+ int iwidth;
+ real *input_data;
+ real *output_data;
+ THIndex_t *indices_data;
+
+
+ THNN_ARGCHECK(input->nDimension == 3 || input->nDimension == 4, 2, input,
+ "3D or 4D (batch mode) tensor expected for input, but got: %s");
+ THNN_CHECK_SHAPE_INDICES(input, indices);
+
+ if (input->nDimension == 4)
+ {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimh-1];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+
+ /* get contiguous input and indices */
+ input = THTensor_(newContiguous)(input);
+ indices = THIndexTensor_(newContiguous)(indices);
+
+ /* resize output */
+ if (input->nDimension == 3)
+ {
+ THTensor_(resize3d)(output, nslices, oheight, owidth);
+ THTensor_(zero)(output);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+ indices_data = THIndexTensor_(data)(indices);
+
+ THNN_(SpatialMaxUnpooling_updateOutput_frame)(input_data, output_data,
+ indices_data,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight);
+ }
+ else
+ {
+ int p;
+
+ THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth);
+ THTensor_(zero)(output);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+ indices_data = THIndexTensor_(data)(indices);
+
+ for (p = 0; p < nbatch; p++)
+ {
+ THNN_(SpatialMaxUnpooling_updateOutput_frame)(
+ input_data+p*nslices*iwidth*iheight,
+ output_data+p*nslices*owidth*oheight,
+ indices_data+p*nslices*iwidth*iheight,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(input);
+ THIndexTensor_(free)(indices);
+}
+
+static void THNN_(SpatialMaxUnpooling_updateGradInput_frame)(real *gradInput_p, real *gradOutput_p,
+ THIndex_t *ind_p,
+ int nslices,
+ int iwidth, int iheight,
+ int owidth, int oheight)
+{
+ int k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nslices; k++)
+ {
+ real *gradInput_p_k = gradInput_p + k*iwidth*iheight;
+ real *gradOutput_p_k = gradOutput_p + k*owidth*oheight;
+ THIndex_t *ind_p_k = ind_p + k*iwidth*iheight;
+
+ int i, j;
+ THIndex_t maxp;
+ for(i = 0; i < iheight; i++)
+ {
+ for(j = 0; j < iwidth; j++)
+ {
+ maxp = ind_p_k[i*iwidth + j] - TH_INDEX_BASE; /* retrieve position of max */
+ if(maxp < 0 || maxp >= owidth * oheight) {
+ THError("invalid max index %ld, owidth= %d, oheight= %d", maxp, owidth, oheight);
+ }
+ gradInput_p_k[i*iwidth + j] = gradOutput_p_k[maxp]; /* update gradient */
+ }
+ }
+ }
+}
+
+void THNN_(SpatialMaxUnpooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int owidth, int oheight)
+{
+ int dimw = 2;
+ int dimh = 1;
+ int nbatch = 1;
+ int nslices;
+ int iheight;
+ int iwidth;
+ real *gradInput_data;
+ real *gradOutput_data;
+ THIndex_t *indices_data;
+
+ THNN_CHECK_SHAPE_INDICES(input, indices);
+
+ /* get contiguous gradOutput and indices */
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ indices = THIndexTensor_(newContiguous)(indices);
+
+ /* resize */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ if (input->nDimension == 4) {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimh-1];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+
+ if(owidth!=gradOutput->size[dimw] || oheight!=gradOutput->size[dimh]){
+ THError("Inconsistent gradOutput size. oheight= %d, owidth= %d, gradOutput: %dx%d",
+ oheight, owidth, gradOutput->size[dimh], gradOutput->size[dimw]);
+ }
+
+ /* get raw pointers */
+ gradInput_data = THTensor_(data)(gradInput);
+ gradOutput_data = THTensor_(data)(gradOutput);
+ indices_data = THIndexTensor_(data)(indices);
+
+ /* backprop */
+ if (input->nDimension == 3)
+ {
+ THNN_(SpatialMaxUnpooling_updateGradInput_frame)(gradInput_data, gradOutput_data,
+ indices_data,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight);
+ }
+ else
+ {
+ int p;
+ for (p = 0; p < nbatch; p++)
+ {
+ THNN_(SpatialMaxUnpooling_updateGradInput_frame)(gradInput_data+p*nslices*iwidth*iheight, gradOutput_data+p*nslices*owidth*oheight,
+ indices_data+p*nslices*iwidth*iheight,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(gradOutput);
+ THIndexTensor_(free)(indices);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialReflectionPadding.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialReflectionPadding.c
new file mode 100644
index 000000000..dcde660ea
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialReflectionPadding.c
@@ -0,0 +1,260 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialReflectionPadding.c"
+#else
+
+static void THNN_(SpatialReflectionPadding_updateOutput_frame)(
+ real *input_p, real *output_p,
+ long nslices,
+ long iwidth, long iheight,
+ long owidth, long oheight,
+ int pad_l, int pad_r,
+ int pad_t, int pad_b)
+{
+ int iStartX = fmax(0, -pad_l);
+ int iStartY = fmax(0, -pad_t);
+ int oStartX = fmax(0, pad_l);
+ int oStartY = fmax(0, pad_t);
+
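+  /* An output column j inside the left pad reads the input mirrored about
+     the left edge (2*pad_l - j); past the right edge it mirrors about the
+     last column ((iwidth + pad_l - 1)*2 - j). The iStart/oStart offsets
+     shift coordinates when a pad is negative (cropping). */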
+ long k, ip_x, ip_y;
+#pragma omp parallel for private(k, ip_x, ip_y)
+
+ for (k = 0; k < nslices; k++)
+ {
+ long i, j;
+ for (i = 0; i < oheight; i++) {
+ for (j = 0; j < owidth; j++) {
+ if (j < pad_l) {
+ ip_x = pad_l * 2 - j;
+ } else if (j >= pad_l && j < iwidth + pad_l) {
+ ip_x = j;
+ } else {
+ ip_x = (iwidth + pad_l - 1) * 2 - j;
+ }
+ ip_x = ip_x - oStartX + iStartX;
+
+ if (i < pad_t) {
+ ip_y = pad_t * 2 - i;
+ } else if (i >= pad_t && i < iheight + pad_t) {
+ ip_y = i;
+ } else {
+ ip_y = (iheight + pad_t - 1) * 2 - i;
+ }
+ ip_y = ip_y - oStartY + iStartY;
+
+ real *dest_p = output_p + k*owidth*oheight + i * owidth + j;
+ real *src_p = input_p + k*iwidth*iheight + ip_y * iwidth + ip_x;
+ *dest_p = *src_p;
+ }
+ }
+ }
+}
+
+void THNN_(SpatialReflectionPadding_updateOutput)(THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int pad_l, int pad_r,
+ int pad_t, int pad_b)
+{
+ int dimw = 2;
+ int dimh = 1;
+ int dimslices = 0;
+ long nbatch = 1;
+ long nslices;
+ long iheight;
+ long iwidth;
+ long oheight;
+ long owidth;
+ real *input_data;
+ real *output_data;
+
+ THNN_ARGCHECK(input->nDimension == 3 || input->nDimension == 4, 2, input,
+ "3D or 4D (batch mode) tensor expected for input, but got: %s");
+
+ if (input->nDimension == 4)
+ {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ dimslices++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimslices];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ oheight = iheight + pad_t + pad_b;
+ owidth = iwidth + pad_l + pad_r;
+
+  THArgCheck(owidth >= 1 && oheight >= 1, 2,
+             "input (H: %ld, W: %ld) is too small."
+             " Calculated output H: %ld W: %ld",
+             iheight, iwidth, oheight, owidth);
+
+ /* get contiguous input */
+ input = THTensor_(newContiguous)(input);
+
+ /* resize output */
+ if (input->nDimension == 3)
+ {
+ THTensor_(resize3d)(output, nslices, oheight, owidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+
+ THNN_(SpatialReflectionPadding_updateOutput_frame)(input_data, output_data,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight,
+ pad_l, pad_r,
+ pad_t, pad_b);
+ }
+ else
+ {
+ long p;
+
+ THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+
+#pragma omp parallel for private(p)
+ for (p = 0; p < nbatch; p++)
+ {
+ THNN_(SpatialReflectionPadding_updateOutput_frame)(
+ input_data+p*nslices*iwidth*iheight,
+ output_data+p*nslices*owidth*oheight,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight,
+ pad_l, pad_r,
+ pad_t, pad_b);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(input);
+}
+
+static void THNN_(SpatialReflectionPadding_updateGradInput_frame)(
+ real *ginput_p, real *goutput_p,
+ long nslices,
+ long iwidth, long iheight,
+ long owidth, long oheight,
+ int pad_l, int pad_r,
+ int pad_t, int pad_b)
+{
+ int iStartX = fmax(0, -pad_l);
+ int iStartY = fmax(0, -pad_t);
+ int oStartX = fmax(0, pad_l);
+ int oStartY = fmax(0, pad_t);
+
+ long k, ip_x, ip_y;
+#pragma omp parallel for private(k, ip_x, ip_y)
+ for (k = 0; k < nslices; k++)
+ {
+ long i, j;
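+    /* Same reflected index mapping as the forward pass; several output
+       pixels can reflect to one input pixel, so gradients accumulate. */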
+ for (i = 0; i < oheight; i++) {
+ for (j = 0; j < owidth; j++) {
+ if (j < pad_l) {
+ ip_x = pad_l * 2 - j;
+ } else if (j >= pad_l && j < iwidth + pad_l) {
+ ip_x = j;
+ } else {
+ ip_x = (iwidth + pad_l - 1) * 2 - j;
+ }
+ ip_x = ip_x - oStartX + iStartX;
+
+ if (i < pad_t) {
+ ip_y = pad_t * 2 - i;
+ } else if (i >= pad_t && i < iheight + pad_t) {
+ ip_y = i;
+ } else {
+ ip_y = (iheight + pad_t - 1) * 2 - i;
+ }
+ ip_y = ip_y - oStartY + iStartY;
+
+ real *src_p = goutput_p + k*owidth*oheight + i * owidth + j;
+ real *dest_p = ginput_p + k*iwidth*iheight + ip_y * iwidth + ip_x;
+ *dest_p += *src_p;
+ }
+ }
+ }
+}
+
+void THNN_(SpatialReflectionPadding_updateGradInput)(THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int pad_l, int pad_r,
+ int pad_t, int pad_b)
+{
+ int dimw = 2;
+ int dimh = 1;
+ int dimslices = 0;
+ long nbatch = 1;
+ long nslices;
+ long iheight;
+ long iwidth;
+ long oheight;
+ long owidth;
+
+ if (input->nDimension == 4)
+ {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ dimslices++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimslices];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ oheight = iheight + pad_t + pad_b;
+ owidth = iwidth + pad_l + pad_r;
+
+ THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3,
+ "gradOutput width unexpected. Expected: %d, Got: %d",
+ owidth, THTensor_(size)(gradOutput, dimw));
+ THArgCheck(oheight == THTensor_(size)(gradOutput, dimh), 3,
+ "gradOutput height unexpected. Expected: %d, Got: %d",
+ oheight, THTensor_(size)(gradOutput, dimh));
+
+ /* get contiguous gradOutput */
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ /* resize */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ /* backprop */
+ if (input->nDimension == 3) {
+ THNN_(SpatialReflectionPadding_updateGradInput_frame)(
+ THTensor_(data)(gradInput),
+ THTensor_(data)(gradOutput),
+ nslices,
+ iwidth, iheight,
+ owidth, oheight,
+ pad_l, pad_r,
+ pad_t, pad_b);
+ } else {
+ long p;
+#pragma omp parallel for private(p)
+ for (p = 0; p < nbatch; p++) {
+ THNN_(SpatialReflectionPadding_updateGradInput_frame)(
+ THTensor_(data)(gradInput) + p * nslices * iheight * iwidth,
+ THTensor_(data)(gradOutput) + p * nslices * oheight * owidth,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight,
+ pad_l, pad_r,
+ pad_t, pad_b);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialReplicationPadding.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialReplicationPadding.c
new file mode 100644
index 000000000..4e318aa70
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialReplicationPadding.c
@@ -0,0 +1,260 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialReplicationPadding.c"
+#else
+
+static void THNN_(SpatialReplicationPadding_updateOutput_frame)(
+ real *input_p, real *output_p,
+ long nslices,
+ long iwidth, long iheight,
+ long owidth, long oheight,
+ int pad_l, int pad_r,
+ int pad_t, int pad_b)
+{
+ int iStartX = fmax(0, -pad_l);
+ int iStartY = fmax(0, -pad_t);
+ int oStartX = fmax(0, pad_l);
+ int oStartY = fmax(0, pad_t);
+
+ long k, ip_x, ip_y;
+#pragma omp parallel for private(k, ip_x, ip_y)
+ for (k = 0; k < nslices; k++)
+ {
+ long i, j;
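+    /* Map each output pixel (i, j) to the input pixel it replicates:
+       the pad regions clamp to the nearest edge row or column. */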
+ for (i = 0; i < oheight; i++) {
+ for (j = 0; j < owidth; j++) {
+ if (j < pad_l) {
+ ip_x = pad_l;
+ } else if (j >= pad_l && j < iwidth + pad_l) {
+ ip_x = j;
+ } else {
+ ip_x = iwidth + pad_l - 1;
+ }
+ ip_x = ip_x - oStartX + iStartX;
+
+ if (i < pad_t) {
+ ip_y = pad_t;
+ } else if (i >= pad_t && i < iheight + pad_t) {
+ ip_y = i;
+ } else {
+ ip_y = iheight + pad_t - 1;
+ }
+ ip_y = ip_y - oStartY + iStartY;
+
+ real *dest_p = output_p + k*owidth*oheight + i * owidth + j;
+ real *src_p = input_p + k*iwidth*iheight + ip_y * iwidth + ip_x;
+ *dest_p = *src_p;
+ }
+ }
+ }
+}
+
+void THNN_(SpatialReplicationPadding_updateOutput)(THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int pad_l, int pad_r,
+ int pad_t, int pad_b)
+{
+ int dimw = 2;
+ int dimh = 1;
+ int dimslices = 0;
+ long nbatch = 1;
+ long nslices;
+ long iheight;
+ long iwidth;
+ long oheight;
+ long owidth;
+ real *input_data;
+ real *output_data;
+
+ THNN_ARGCHECK(input->nDimension == 3 || input->nDimension == 4, 2, input,
+ "3D or 4D (batch mode) tensor expected for input, but got: %s");
+
+ if (input->nDimension == 4)
+ {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ dimslices++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimslices];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ oheight = iheight + pad_t + pad_b;
+ owidth = iwidth + pad_l + pad_r;
+
+  THArgCheck(owidth >= 1 && oheight >= 1, 2,
+             "input (H: %d, W: %d) is too small."
+             " Calculated output H: %d W: %d",
+             iheight, iwidth, oheight, owidth);
+
+ /* get contiguous input */
+ input = THTensor_(newContiguous)(input);
+
+ /* resize output */
+ if (input->nDimension == 3)
+ {
+ THTensor_(resize3d)(output, nslices, oheight, owidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+
+ THNN_(SpatialReplicationPadding_updateOutput_frame)(input_data, output_data,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight,
+ pad_l, pad_r,
+ pad_t, pad_b);
+ }
+ else
+ {
+ long p;
+
+ THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+
+#pragma omp parallel for private(p)
+ for (p = 0; p < nbatch; p++)
+ {
+ THNN_(SpatialReplicationPadding_updateOutput_frame)(
+ input_data+p*nslices*iwidth*iheight,
+ output_data+p*nslices*owidth*oheight,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight,
+ pad_l, pad_r,
+ pad_t, pad_b);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(input);
+}
+
+static void THNN_(SpatialReplicationPadding_updateGradInput_frame)(
+ real *ginput_p, real *goutput_p,
+ long nslices,
+ long iwidth, long iheight,
+ long owidth, long oheight,
+ int pad_l, int pad_r,
+ int pad_t, int pad_b)
+{
+ int iStartX = fmax(0, -pad_l);
+ int iStartY = fmax(0, -pad_t);
+ int oStartX = fmax(0, pad_l);
+ int oStartY = fmax(0, pad_t);
+
+ long k, ip_x, ip_y;
+#pragma omp parallel for private(k, ip_x, ip_y)
+ for (k = 0; k < nslices; k++)
+ {
+ long i, j;
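+    /* Same clamped index mapping as the forward pass; every output pixel
+       in a pad region maps to an edge pixel, so gradients accumulate. */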
+ for (i = 0; i < oheight; i++) {
+ for (j = 0; j < owidth; j++) {
+ if (j < pad_l) {
+ ip_x = pad_l;
+ } else if (j >= pad_l && j < iwidth + pad_l) {
+ ip_x = j;
+ } else {
+ ip_x = iwidth + pad_l - 1;
+ }
+ ip_x = ip_x - oStartX + iStartX;
+
+ if (i < pad_t) {
+ ip_y = pad_t;
+ } else if (i >= pad_t && i < iheight + pad_t) {
+ ip_y = i;
+ } else {
+ ip_y = iheight + pad_t - 1;
+ }
+ ip_y = ip_y - oStartY + iStartY;
+
+ real *src_p = goutput_p + k*owidth*oheight + i * owidth + j;
+ real *dest_p = ginput_p + k*iwidth*iheight + ip_y * iwidth + ip_x;
+ *dest_p += *src_p;
+ }
+ }
+ }
+}
+
+void THNN_(SpatialReplicationPadding_updateGradInput)(THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int pad_l, int pad_r,
+ int pad_t, int pad_b)
+{
+ int dimw = 2;
+ int dimh = 1;
+ int dimslices = 0;
+ long nbatch = 1;
+ long nslices;
+ long iheight;
+ long iwidth;
+ long oheight;
+ long owidth;
+
+ if (input->nDimension == 4)
+ {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ dimslices++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimslices];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ oheight = iheight + pad_t + pad_b;
+ owidth = iwidth + pad_l + pad_r;
+
+ THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3,
+ "gradOutput width unexpected. Expected: %d, Got: %d",
+ owidth, THTensor_(size)(gradOutput, dimw));
+ THArgCheck(oheight == THTensor_(size)(gradOutput, dimh), 3,
+ "gradOutput height unexpected. Expected: %d, Got: %d",
+ oheight, THTensor_(size)(gradOutput, dimh));
+
+ /* get contiguous gradOutput */
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ /* resize */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ /* backprop */
+ if (input->nDimension == 3) {
+ THNN_(SpatialReplicationPadding_updateGradInput_frame)(
+ THTensor_(data)(gradInput),
+ THTensor_(data)(gradOutput),
+ nslices,
+ iwidth, iheight,
+ owidth, oheight,
+ pad_l, pad_r,
+ pad_t, pad_b);
+ } else {
+ long p;
+#pragma omp parallel for private(p)
+ for (p = 0; p < nbatch; p++) {
+ THNN_(SpatialReplicationPadding_updateGradInput_frame)(
+ THTensor_(data)(gradInput) + p * nslices * iheight * iwidth,
+ THTensor_(data)(gradOutput) + p * nslices * oheight * owidth,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight,
+ pad_l, pad_r,
+ pad_t, pad_b);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialSubSampling.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialSubSampling.c
new file mode 100644
index 000000000..4c077bc64
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialSubSampling.c
@@ -0,0 +1,302 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialSubSampling.c"
+#else
+
+static inline void THNN_(SpatialSubSampling_shapeCheck)(
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *weight,
+ int kW, int kH) {
+ int ndims = input->nDimension;
+ THNN_ARGCHECK(input->nDimension == 3 || input->nDimension == 4, 2, input,
+ "3D or 4D input tensor expected but got: %s");
+ THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous");
+
+ int nInputPlane = THTensor_(size)(weight, 0);
+
+ int dimw = 2;
+ int dimh = 1;
+
+ long inputWidth;
+ long inputHeight;
+
+ if (input->nDimension == 4) {
+ dimw++;
+ dimh++;
+ }
+
+ inputWidth = input->size[dimw];
+ inputHeight = input->size[dimh];
+
+ THArgCheck(input->size[dimh-1] == nInputPlane, 2, "invalid number of input planes");
+ THArgCheck(inputWidth >= kW && inputHeight >= kH, 2, "input image smaller than kernel size");
+}
+
+void THNN_(SpatialSubSampling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ int kW, int kH,
+ int dW, int dH)
+{
+ THArgCheck(!bias || THTensor_(isContiguous)(bias), 5, "bias must be contiguous");
+
+ real *weight_data = THTensor_(data)(weight);
+ real *bias_data = THTensor_(data)(bias);
+ real *output_data;
+ real *input_data;
+
+ int dimw = 2;
+ int dimh = 1;
+ long nbatch = 1;
+
+ long inputWidth;
+ long inputHeight;
+ long outputWidth;
+ long outputHeight;
+
+ int nInputPlane = THTensor_(size)(weight,0);
+
+ long k;
+
+ THNN_(SpatialSubSampling_shapeCheck)(input, NULL, weight, kW, kH);
+
+ if (input->nDimension == 4) {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ }
+
+ inputWidth = input->size[dimw];
+ inputHeight = input->size[dimh];
+ outputWidth = (inputWidth - kW) / dW + 1;
+ outputHeight = (inputHeight - kH) / dH + 1;
+
+ if (input->nDimension == 3)
+ THTensor_(resize3d)(output, nInputPlane, outputHeight, outputWidth);
+ else
+ THTensor_(resize4d)(output, input->size[0], nInputPlane, outputHeight, outputWidth);
+
+ input = THTensor_(newContiguous)(input);
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+
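+  /* Sub-sampling forward: every output pixel is bias[k] + weight[k] *
+     (sum over its kW x kH input window); there is one scalar weight and
+     one scalar bias per input plane. */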
+#pragma omp parallel for private(k)
+ for(k = 0; k < nInputPlane; k++)
+ {
+ long p;
+ for(p = 0; p < nbatch; p++)
+ {
+ long xx, yy;
+ /* For all output pixels... */
+ real *ptr_output = output_data + p*nInputPlane*outputWidth*outputHeight + k*outputWidth*outputHeight;
+      /* Get the scalar weight for plane k */
+ real the_weight = weight_data[k];
+ /* Initialize to the bias */
+ real z = bias_data[k];
+ long i;
+ for(i = 0; i < outputWidth*outputHeight; i++)
+ ptr_output[i] = z;
+
+ for(yy = 0; yy < outputHeight; yy++)
+ {
+ for(xx = 0; xx < outputWidth; xx++)
+ {
+          /* Sum the kW x kH input window... */
+ real *ptr_input = input_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight + yy*dH*inputWidth+xx*dW;
+ real sum = 0;
+ long kx, ky;
+
+ for(ky = 0; ky < kH; ky++)
+ {
+ for(kx = 0; kx < kW; kx++)
+ sum += ptr_input[kx];
+ ptr_input += inputWidth; /* next input line */
+ }
+ /* Update output */
+ *ptr_output++ += the_weight*sum;
+ }
+ }
+ }
+ }
+ THTensor_(free)(input);
+}
+
+void THNN_(SpatialSubSampling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ int kW, int kH,
+ int dW, int dH)
+{
+ THNN_(SpatialSubSampling_shapeCheck)(input, gradOutput, weight, kW, kH);
+
+ int dimw = 2;
+ int dimh = 1;
+ long nbatch = 1;
+
+ long inputWidth;
+ long inputHeight;
+ long outputWidth;
+ long outputHeight;
+
+ int nInputPlane = THTensor_(size)(weight,0);
+
+ real *weight_data;
+ real *gradOutput_data;
+ real *input_data, *gradInput_data;
+
+ long k;
+
+ if (input->nDimension == 4) {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ }
+
+ inputWidth = input->size[dimw];
+ inputHeight = input->size[dimh];
+ outputWidth = (inputWidth - kW) / dW + 1;
+ outputHeight = (inputHeight - kH) / dH + 1;
+
+ weight_data = THTensor_(data)(weight);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ gradOutput_data = THTensor_(data)(gradOutput);
+
+ input_data = THTensor_(data)(input);
+
+ THTensor_(resizeAs)(gradInput, input);
+ gradInput_data = THTensor_(data)(gradInput);
+
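+  /* Backward of the windowed sum: each gradOutput value, scaled by the
+     plane's scalar weight, is accumulated into every pixel of its
+     kW x kH input window. */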
+#pragma omp parallel for private(k)
+ for(k = 0; k < nInputPlane; k++)
+ {
+ long p;
+ for(p = 0; p < nbatch; p++)
+ {
+ real the_weight = weight_data[k];
+ real *ptr_gradOutput = gradOutput_data + p*nInputPlane*outputHeight*outputWidth + k*outputWidth*outputHeight;
+ long xx, yy;
+
+ real* ptr_gi = gradInput_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight;
+ long i;
+ for(i=0; i<inputWidth*inputHeight; i++)
+ ptr_gi[i] = 0.0;
+
+ for(yy = 0; yy < outputHeight; yy++)
+ {
+ for(xx = 0; xx < outputWidth; xx++)
+ {
+ real *ptr_gradInput = gradInput_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight + yy*dH*inputWidth+xx*dW;
+ real z = *ptr_gradOutput++ * the_weight;
+ long kx, ky;
+
+ for(ky = 0; ky < kH; ky++)
+ {
+ for(kx = 0; kx < kW; kx++)
+ ptr_gradInput[kx] += z;
+ ptr_gradInput += inputWidth;
+ }
+ }
+ }
+ }
+ }
+ THTensor_(free)(gradOutput);
+}
+
+void THNN_(SpatialSubSampling_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ int kW, int kH,
+ int dW, int dH,
+ accreal scale_)
+{
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ THNN_(SpatialSubSampling_shapeCheck)(input, gradOutput, gradWeight, kW, kH);
+
+ long nbatch = 1;
+ long dimw = 2;
+ long dimh = 1;
+
+ long inputWidth;
+ long inputHeight;
+ long outputWidth;
+ long outputHeight;
+
+ int nInputPlane = THTensor_(size)(gradWeight,0);
+
+ real *gradWeight_data;
+ real *gradBias_data;
+ real *gradOutput_data;
+ real *input_data;
+
+ long k;
+
+ if (input->nDimension == 4) {
+ dimw++;
+ dimh++;
+ nbatch = input->size[0];
+ }
+
+ inputWidth = input->size[dimw];
+ inputHeight = input->size[dimh];
+ outputWidth = (inputWidth - kW) / dW + 1;
+ outputHeight = (inputHeight - kH) / dH + 1;
+
+ gradWeight_data = THTensor_(data)(gradWeight);
+ gradBias_data = THTensor_(data)(gradBias);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ gradOutput_data = THTensor_(data)(gradOutput);
+
+ input = THTensor_(newContiguous)(input);
+ input_data = THTensor_(data)(input);
+
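+  /* gradBias[k] accumulates the plain sum of gradOutput over plane k;
+     gradWeight[k] accumulates sum(gradOutput * input-window sum),
+     both scaled by `scale`. */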
+#pragma omp parallel for private(k)
+ for(k = 0; k < nInputPlane; k++)
+ {
+ long p;
+ for(p = 0; p < nbatch; p++)
+ {
+ real *ptr_gradOutput = gradOutput_data + p*nInputPlane*outputHeight*outputWidth + k*outputWidth*outputHeight;
+ real sum;
+ long xx, yy;
+ long i;
+
+ sum = 0;
+ for(i = 0; i < outputWidth*outputHeight; i++)
+ sum += ptr_gradOutput[i];
+ gradBias_data[k] += scale*sum;
+
+ sum = 0;
+ for(yy = 0; yy < outputHeight; yy++)
+ {
+ for(xx = 0; xx < outputWidth; xx++)
+ {
+ real *ptr_input = input_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight + yy*dH*inputWidth+xx*dW;
+ real z = *ptr_gradOutput++;
+ long kx, ky;
+
+ for(ky = 0; ky < kH; ky++)
+ {
+ for(kx = 0; kx < kW; kx++)
+ sum += z * ptr_input[kx];
+ ptr_input += inputWidth;
+ }
+ }
+ }
+ gradWeight_data[k] += scale*sum;
+ }
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialUpSamplingBilinear.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialUpSamplingBilinear.c
new file mode 100644
index 000000000..8bc487ead
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialUpSamplingBilinear.c
@@ -0,0 +1,174 @@
+// Adapted from interp.cpp from Caffe util by Pauline Luc
+// Originally developed by George Papandreou
+
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialUpSamplingBilinear.c"
+#else
+
+static inline void THNN_(SpatialUpSamplingBilinear_shapeCheck)
+ (THTensor *input, THTensor *gradOutput,
+ int nBatch, int nChannels,
+ int inputHeight, int inputWidth,
+ int outputHeight, int outputWidth) {
+ THArgCheck(inputHeight > 0 && inputWidth > 0
+ && outputHeight > 0 && outputWidth > 0, 2,
+ "input and output sizes should be greater than 0,"
+ " but got input (H: %d, W: %d) output (H: %d, W: %d)",
+ inputHeight, inputWidth, outputHeight, outputWidth);
+ if (input != NULL) {
+ THNN_ARGCHECK(input->nDimension == 4, 2, input,
+ "4D input tensor expected but got: %s");
+ }
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 0, nBatch);
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 1, nChannels);
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 2, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 3, outputWidth);
+ }
+}
+
+void THNN_(SpatialUpSamplingBilinear_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int outputHeight,
+ int outputWidth){
+
+ int nbatch = THTensor_(size)(input, 0);
+ int channels = THTensor_(size)(input, 1);
+ int inputHeight = THTensor_(size)(input, 2);
+ int inputWidth = THTensor_(size)(input, 3);
+
+ THNN_(SpatialUpSamplingBilinear_shapeCheck)
+ (input, NULL,
+ nbatch, channels,
+ inputHeight, inputWidth,
+ outputHeight, outputWidth);
+
+ input = THTensor_(newContiguous)(input);
+ THTensor_(resize4d)(output,
+ THTensor_(size)(input, 0),
+ THTensor_(size)(input, 1),
+ outputHeight, outputWidth);
+ THTensor_(zero)(output);
+ real *idata = THTensor_(data)(input);
+ real *odata = THTensor_(data)(output);
+ channels = nbatch * channels;
+ THAssert(inputHeight > 0 && inputWidth > 0 && outputHeight > 0 && outputWidth > 0);
+ // special case: just copy
+ if (inputHeight == outputHeight && inputWidth == outputWidth) {
+ for (int h2 = 0; h2 < outputHeight; ++h2) {
+ const int h1 = h2;
+ for (int w2 = 0; w2 < outputWidth; ++w2) {
+ const int w1 = w2;
+ const real* pos1 = &idata[h1 * inputWidth + w1];
+ real* pos2 = &odata[h2 * outputWidth + w2];
+ for (int c = 0; c < channels; ++c) {
+ pos2[0] = pos1[0];
+ pos1 += inputWidth * inputHeight;
+ pos2 += outputWidth * outputHeight;
+ }
+ }
+ }
+    THTensor_(free)(input);
+    return;
+ }
+  const float rheight = (outputHeight > 1) ? (float)(inputHeight - 1) / (outputHeight - 1) : 0.f;
+ const float rwidth = (outputWidth > 1) ? (float)(inputWidth - 1) / (outputWidth - 1) : 0.f;
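+  // Corner-aligned mapping: output row h2 samples the real-valued input
+  // row h1r = rheight * h2; h1 is its floor, h1p selects the next row when
+  // one exists, and h0lambda/h1lambda are the linear blend weights. Columns
+  // use the same scheme, giving a weighted blend of four input neighbours.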
+ for (int h2 = 0; h2 < outputHeight; ++h2) {
+ const float h1r = rheight * h2;
+ const int h1 = h1r;
+ const int h1p = (h1 < inputHeight - 1) ? 1 : 0;
+ const real h1lambda = h1r - h1;
+ const real h0lambda = (real)1. - h1lambda;
+ for (int w2 = 0; w2 < outputWidth; ++w2) {
+ const float w1r = rwidth * w2;
+ const int w1 = w1r;
+ const int w1p = (w1 < inputWidth - 1) ? 1 : 0;
+ const real w1lambda = w1r - w1;
+ const real w0lambda = (real)1. - w1lambda;
+ const real* pos1 = &idata[h1 * inputWidth + w1];
+ real* pos2 = &odata[h2 * outputWidth + w2];
+ for (int c = 0; c < channels; ++c) {
+        pos2[0] = h0lambda * (w0lambda * pos1[0] + w1lambda * pos1[w1p])
+ + h1lambda * (w0lambda * pos1[h1p * inputWidth]
+ + w1lambda * pos1[h1p * inputWidth + w1p]);
+ pos1 += inputWidth * inputHeight;
+ pos2 += outputWidth * outputHeight;
+ }
+ }
+ }
+ THTensor_(free)(input);
+}
+
+void THNN_(SpatialUpSamplingBilinear_updateGradInput)(
+ THNNState *state,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int nbatch,
+ int channels,
+ int inputHeight,
+ int inputWidth,
+ int outputHeight,
+ int outputWidth){
+
+ THNN_(SpatialUpSamplingBilinear_shapeCheck)
+ (NULL, gradOutput,
+ nbatch, channels,
+ inputHeight, inputWidth,
+ outputHeight, outputWidth);
+
+ THTensor_(resize4d)(gradInput, nbatch, channels, inputHeight, inputWidth);
+ THTensor_(zero)(gradInput);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ real *data1 = THTensor_(data)(gradInput);
+ real *data2 = THTensor_(data)(gradOutput);
+ channels = nbatch * channels;
+
+ // special case: same-size matching grids
+ if (inputHeight == outputHeight && inputWidth == outputWidth) {
+ for (int h2 = 0; h2 < outputHeight; ++h2) {
+ const int h1 = h2;
+ for (int w2 = 0; w2 < outputWidth; ++w2) {
+ const int w1 = w2;
+ real* pos1 = &data1[h1 * inputWidth + w1];
+ const real* pos2 = &data2[h2 * outputWidth + w2];
+ for (int c = 0; c < channels; ++c) {
+ pos1[0] += pos2[0];
+ pos1 += inputWidth * inputHeight;
+ pos2 += outputWidth * outputHeight;
+ }
+ }
+ }
+    THTensor_(free)(gradOutput);
+    return;
+ }
+  const float rheight = (outputHeight > 1) ? (float)(inputHeight - 1) / (outputHeight - 1) : 0.f;
+ const float rwidth = (outputWidth > 1) ? (float)(inputWidth - 1)/(outputWidth - 1) : 0.f;
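+  // The backward pass mirrors the forward interpolation: each gradOutput
+  // pixel is scattered to the same four input neighbours with the same
+  // h/w lambda weights.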
+ for (int h2 = 0; h2 < outputHeight; ++h2) {
+ const float h1r = rheight * h2;
+ const int h1 = h1r;
+ const int h1p = (h1 < inputHeight - 1) ? 1 : 0;
+ const real h1lambda = h1r - h1;
+ const real h0lambda = (real)1. - h1lambda;
+ for (int w2 = 0; w2 < outputWidth; ++w2) {
+ const float w1r = rwidth * w2;
+ const int w1 = w1r;
+ const int w1p = (w1 < inputWidth - 1) ? 1 : 0;
+ const real w1lambda = w1r - w1;
+ const real w0lambda = (real)1. - w1lambda;
+ real* pos1 = &data1[h1 * inputWidth + w1];
+ const real* pos2 = &data2[h2 * outputWidth + w2];
+ for (int c = 0; c < channels; ++c) {
+ pos1[0] += h0lambda * w0lambda * pos2[0];
+ pos1[w1p] += h0lambda * w1lambda * pos2[0];
+ pos1[h1p * inputWidth] += h1lambda * w0lambda * pos2[0];
+ pos1[h1p * inputWidth + w1p] += h1lambda * w1lambda * pos2[0];
+ pos1 += inputWidth * inputHeight;
+ pos2 += outputWidth * outputHeight;
+ }
+ }
+ }
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialUpSamplingNearest.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialUpSamplingNearest.c
new file mode 100644
index 000000000..b4699ff3e
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialUpSamplingNearest.c
@@ -0,0 +1,199 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialUpSamplingNearest.c"
+#else
+
+static inline void THNN_(SpatialUpSamplingNearest_shapeCheck)
+ (THTensor *input, THTensor *gradOutput,
+ int scale_factor) {
+  THArgCheck(input != NULL, 2, "3D or 4D input tensor expected but got NULL");
+ THArgCheck(scale_factor > 1, 4,
+ "scale_factor must be greater than 1, but got: %d", scale_factor);
+ THNN_ARGCHECK(input->nDimension == 3 || input->nDimension == 4, 2, input,
+ "3D or 4D input tensor expected but got: %s");
+ if (input->nDimension == 3) {
+ int nChannels = THTensor_(size)(input, 0);
+ int inputHeight = THTensor_(size)(input, 1);
+ int inputWidth = THTensor_(size)(input, 2);
+ int outputHeight = inputHeight * scale_factor;
+ int outputWidth = inputWidth * scale_factor;
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, 3, 0, nChannels);
+ THNN_CHECK_DIM_SIZE(gradOutput, 3, 1, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, 3, 2, outputWidth);
+ }
+ } else {
+ int nBatch = THTensor_(size)(input, 0);
+ int nChannels = THTensor_(size)(input, 1);
+ int inputHeight = THTensor_(size)(input, 2);
+ int inputWidth = THTensor_(size)(input, 3);
+ int outputHeight = inputHeight * scale_factor;
+ int outputWidth = inputWidth * scale_factor;
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 0, nBatch);
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 1, nChannels);
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 2, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 3, outputWidth);
+ }
+ }
+}
+
+void THNN_(SpatialUpSamplingNearest_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int scale_factor)
+{
+ THNN_(SpatialUpSamplingNearest_shapeCheck)(input, NULL, scale_factor);
+ int inputHeight = THTensor_(size)(input, input->nDimension-2);
+ int inputWidth = THTensor_(size)(input, input->nDimension-1);
+ int outputHeight = inputHeight * scale_factor;
+ int outputWidth = inputWidth * scale_factor;
+
+ if (input->nDimension == 3) {
+ THTensor_(resize3d)(output,
+ THTensor_(size)(input, 0),
+ outputHeight, outputWidth);
+ } else {
+ THTensor_(resize4d)(output,
+ THTensor_(size)(input, 0),
+ THTensor_(size)(input, 1),
+ outputHeight, outputWidth);
+ }
+
+ int dW = scale_factor;
+ int dH = scale_factor;
+ int xDim = input->nDimension-2;
+ int yDim = input->nDimension-1;
+
+ // dims
+ int idim = input->nDimension;
+ int osz0 = output->size[0];
+ int osz1 = output->size[1];
+ int osz2 = output->size[2];
+ int osz3 = 1;
+ if (idim > 3) {
+ osz3 = output->size[3];
+ }
+
+ // get strides
+ long *is = input->stride;
+ long *os = output->stride;
+
+ // get raw pointers
+ real *pin = THTensor_(data)(input);
+ real *pout = THTensor_(data)(output);
+
+ // perform the upsampling
+ int i0, i1, i2, i3, isrc, idst;
+ int iout[4]; // Output indices
+ int iin[4]; // Input indices
+
+ for (i0 = 0; i0 < osz0; i0++) {
+ iout[0] = i0;
+ iin[0] = i0;
+ for (i1 = 0; i1 < osz1; i1++) {
+ iout[1] = i1;
+ iin[1] = i1;
+ for (i2 = 0; i2 < osz2; i2++) {
+ iout[2] = i2;
+ iin[2] = i2;
+ for (i3 = 0; i3 < osz3; i3++) {
+ iout[3] = i3;
+ iin[3] = i3;
+
+ // set the indices for the upsampled dimensions
+ iin[xDim] = iout[xDim] / dW;
+ iin[yDim] = iout[yDim] / dH;
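+          // nearest neighbour: integer division floors each output
+          // coordinate back to the input pixel it was copied from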
+
+ idst = i0*os[0] + i1*os[1] + i2*os[2];
+ isrc = iin[0]*is[0] + iin[1]*is[1] + iin[2]*is[2];
+ if (idim > 3) {
+ idst += i3*os[3];
+ isrc += iin[3]*is[3];
+ }
+
+ pout[idst] = pin[isrc];
+ }
+ }
+ }
+ }
+}
+
+void THNN_(SpatialUpSamplingNearest_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int scale_factor)
+{
+ THNN_(SpatialUpSamplingNearest_shapeCheck)(input, gradOutput, scale_factor);
+ THTensor_(resizeAs)(gradInput, input);
+
+ int dW = scale_factor;
+ int dH = scale_factor;
+ int xDim = gradInput->nDimension-2;
+ int yDim = gradInput->nDimension-1;
+
+ // dims
+  int idim = gradInput->nDimension; // Guaranteed to be 3 or 4 by the shape check
+ int isz0 = gradInput->size[0];
+ int isz1 = gradInput->size[1];
+ int isz2 = gradInput->size[2];
+ int isz3 = 1;
+ if (idim > 3) {
+ isz3 = gradInput->size[3];
+ }
+
+ // get strides
+ long *is = gradInput->stride;
+ long *os = gradOutput->stride;
+
+ // get raw pointers
+ real *pin = THTensor_(data)(gradInput);
+ real *pout = THTensor_(data)(gradOutput);
+
+ // perform the upsampling
+ int i0, i1, i2, i3, isrc, idst, x, y;
+ int iin[4]; // Input indices
+ int iout[4]; // Output indices
+
+ THTensor_(zero)(gradInput);
+
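+  // Each input pixel accumulates the gradients of the dW x dH block of
+  // output pixels it produced in the forward pass.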
+ for (i0 = 0; i0 < isz0; i0++) {
+ iin[0] = i0;
+ iout[0] = i0;
+ for (i1 = 0; i1 < isz1; i1++) {
+ iin[1] = i1;
+ iout[1] = i1;
+ for (i2 = 0; i2 < isz2; i2++) {
+ iin[2] = i2;
+ iout[2] = i2;
+ for (i3 = 0; i3 < isz3; i3++) {
+ iin[3] = i3;
+ iout[3] = i3;
+
+ idst = i0*is[0] + i1*is[1] + i2*is[2];
+ if (idim > 3) {
+ idst += i3*is[3];
+ }
+
+ // Now accumulate the gradients from gradOutput
+ for (y = 0; y < dH; y++) {
+ for (x = 0; x < dW; x++) {
+ iout[xDim] = dW * iin[xDim] + x;
+ iout[yDim] = dH * iin[yDim] + y;
+ isrc = iout[0]*os[0] + iout[1]*os[1] + iout[2]*os[2];
+ if (idim > 3) {
+ isrc += iout[3]*os[3];
+ }
+ pin[idst] += pout[isrc];
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/Sqrt.c b/contrib/lua-torch/nn/lib/THNN/generic/Sqrt.c
new file mode 100644
index 000000000..174884e34
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/Sqrt.c
@@ -0,0 +1,52 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/Sqrt.c"
+#else
+
+void THNN_(Sqrt_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ accreal eps_)
+{
+ real eps = TH_CONVERT_ACCREAL_TO_REAL(eps_);
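+  (void)eps; /* eps is part of the shared THNN signature but unused by this kernel */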
+ THTensor_(resizeAs)(output, input);
+ THTensor_(sqrt)(output, input);
+}
+
+void THNN_(Sqrt_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *output)
+{
+ THNN_CHECK_SHAPE(output, gradOutput);
+ THTensor_(resizeAs)(gradInput, input);
+
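+  /* d/dx sqrt(x) = 1 / (2 sqrt(x)), so gradInput = 0.5 * gradOutput / output;
+     a zero output maps to a zero gradient to avoid division by zero. */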
+ if (output->nDimension == 1 ||
+ !THTensor_(isContiguous)(output) ||
+ !THTensor_(isContiguous)(gradOutput) ||
+ !THTensor_(isContiguous)(gradInput))
+ {
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output,
+ *gradInput_data = (*output_data == 0.0) ? 0.0 : (0.5 * (*gradOutput_data / *output_data));
+ );
+ }
+ else
+ {
+ real *gradOutput_data = THTensor_(data)(gradOutput);
+ real *gradInput_data = THTensor_(data)(gradInput);
+ real *output_data = THTensor_(data)(output);
+ long i;
+#pragma omp parallel for private(i)
+ for(i = 0; i < THTensor_(nElement)(output); i++)
+ {
+ if (output_data[i] == 0.0)
+ gradInput_data[i] = 0.0;
+ else
+ gradInput_data[i] = 0.5 * (gradOutput_data[i] / output_data[i]);
+ }
+ }
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/Square.c b/contrib/lua-torch/nn/lib/THNN/generic/Square.c
new file mode 100644
index 000000000..aad0a911c
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/Square.c
@@ -0,0 +1,59 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/Square.c"
+#else
+
+void THNN_(Square_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output)
+{
+ THTensor_(resizeAs)(output, input);
+
+ if (input->nDimension == 1 || !THTensor_(isContiguous)(input) || !THTensor_(isContiguous)(output))
+ {
+ TH_TENSOR_APPLY2(real, output, real, input,
+ *output_data = (*input_data) * (*input_data);
+ );
+ }
+ else
+ {
+ real *output_data = THTensor_(data)(output);
+ real *input_data = THTensor_(data)(input);
+ long i;
+#pragma omp parallel for private(i)
+ for (i = 0; i < THTensor_(nElement)(input); i++)
+ output_data[i] = input_data[i]*input_data[i];
+ }
+}
+
+void THNN_(Square_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput)
+{
+ THNN_CHECK_SHAPE(input, gradOutput);
+ THTensor_(resizeAs)(gradInput, input);
+
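+  /* d/dx x^2 = 2x, so gradInput = 2 * input * gradOutput. */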
+ if (input->nDimension == 1 ||
+ !THTensor_(isContiguous)(input) ||
+ !THTensor_(isContiguous)(gradOutput) ||
+ !THTensor_(isContiguous)(gradInput))
+ {
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
+ *gradInput_data = 2.0 * (*gradOutput_data) * (*input_data);
+ );
+ }
+ else
+ {
+ real *gradOutput_data = THTensor_(data)(gradOutput);
+ real *gradInput_data = THTensor_(data)(gradInput);
+ real *input_data = THTensor_(data)(input);
+ long i;
+#pragma omp parallel for private(i)
+ for (i = 0; i < THTensor_(nElement)(gradInput); i++)
+ gradInput_data[i] = 2.0 * gradOutput_data[i] * input_data[i];
+ }
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/THNN.h b/contrib/lua-torch/nn/lib/THNN/generic/THNN.h
new file mode 100644
index 000000000..76a28eb2d
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/THNN.h
@@ -0,0 +1,1501 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/THNN.h"
+#else
+
+TH_API void THNN_(Abs_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *output); // [OUT] Abs output
+TH_API void THNN_(Abs_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *gradOutput, // gradient w.r.t. output
+ THTensor *gradInput); // [OUT] gradient w.r.t. input
+
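+// Typical call pattern for these kernels (illustrative sketch, not part of
+// the API): the forward fills `output`, the backward fills `gradInput`
+// given the upstream `gradOutput`:
+//   THNN_(Abs_updateOutput)(state, input, output);
+//   THNN_(Abs_updateGradInput)(state, input, gradOutput, gradInput);
+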
+TH_API void THNN_(AbsCriterion_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *target, // tensor with target values
+ THTensor *output, // [OUT] a one-element tensor with loss
+ bool sizeAverage); // if true, the loss will be divided by batch size
+TH_API void THNN_(AbsCriterion_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *target, // tensor with target values
+ THTensor *gradInput, // [OUT] gradient w.r.t. input
+ bool sizeAverage); // if true, the gradient will be normalized by batch size
+
+TH_API void THNN_(BCECriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *output,
+ bool sizeAverage,
+ THTensor *weights); // [OPTIONAL]
+TH_API void THNN_(BCECriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage,
+ THTensor *weights); // [OPTIONAL]
+
+TH_API void THNN_(ClassNLLCriterion_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor (1D/2D)
+ THIndexTensor *target, // tensor containing indexes of target classes
+ THTensor *output, // [OUT] a one-element tensor with loss
+ bool sizeAverage, // if true, the loss will be normalized by batch size and class weights
+ THTensor *weights, // [OPTIONAL] class weights
+ THTensor *total_weight, // [BUFFER]
+ long ignore_index); // target index to ignore (loss = 0, gradInput = 0)
+TH_API void THNN_(ClassNLLCriterion_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor (1D/2D)
+ THIndexTensor *target, // tensor containing indexes of target classes
+ THTensor *gradInput, // [OUT] gradient w.r.t. input
+ bool sizeAverage, // if true, the loss will be normalized by batch size and class weights
+ THTensor *weights, // [OPTIONAL] class weights
+ THTensor *total_weight, // [BUFFER]
+ long ignore_index); // target index to ignore (loss = 0, gradInput = 0)
+
+TH_API void THNN_(SpatialClassNLLCriterion_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor (4D)
+ THIndexTensor *target, // tensor containing indexes of target classes (3D)
+ THTensor *output, // [OUT] a one-element tensor with loss
+ bool sizeAverage, // if true, the loss will be normalized by batch size and class weights
+ THTensor *weights, // [OPTIONAL] class weights
+ THTensor *total_weight); // [BUFFER]
+TH_API void THNN_(SpatialClassNLLCriterion_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor (4D)
+ THIndexTensor *target, // tensor containing indexes of target classes (3D)
+ THTensor *gradInput, // [OUT] gradient w.r.t. input
+ bool sizeAverage, // if true, the loss will be normalized by batch size and class weights
+ THTensor *weights, // [OPTIONAL] class weights
+ THTensor *total_weight); // [BUFFER]
+
+TH_API void THNN_(ELU_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *output, // [OUT] ELU output
+ accreal alpha, // an ELU parameter (as in paper)
+ bool inplace); // if true, modifies input and sets output onto it (no additional memory is allocated)
+TH_API void THNN_(ELU_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *gradOutput, // gradient w.r.t. output
+ THTensor *gradInput, // [OUT] gradient w.r.t. input
+ THTensor *output, // output from a forward pass
+ accreal alpha, // an ELU parameter (as in paper)
+ bool inplace); // if true, modifies gradOutput and sets gradInput onto it (no additional memory is allocated)
+
+TH_API void THNN_(DistKLDivCriterion_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *target, // target tensor
+ THTensor *output, // [OUT] a one-element tensor containing the loss
+ bool sizeAverage); // if true, the loss will be normalized **by total number of elements**
+TH_API void THNN_(DistKLDivCriterion_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *target, // target tensor
+ THTensor *gradInput, // [OUT] gradient w.r.t. input
+ bool sizeAverage); // if true, the loss will be normalized **by total number of elements**
+
+TH_API void THNN_(GatedLinear_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *output, // [OUT] output tensor, half size of input along dimension dim
+ int dim); // dimension for halving operation
+TH_API void THNN_(GatedLinear_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *gradOutput, // gradient w.r.t module's output
+ THTensor *gradInput, // [OUT] gradient w.r.t input
+ int dim); // dimension for halving operation
+
+// HardShrink outputs 0 on the interval (-lambda, lambda) and the original value otherwise.
+TH_API void THNN_(HardShrink_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *output, // [OUT] output tensor
+ accreal lambda); // HardShrink parameter
+TH_API void THNN_(HardShrink_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *gradOutput, // gradient w.r.t. module's output
+ THTensor *gradInput, // [OUT] gradient w.r.t. input
+ accreal lambda); // HardShrink parameter
+
+// HardTanh clamps the values to the interval [min_val, max_val].
+TH_API void THNN_(HardTanh_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *output, // [OUT] output tensor
+ accreal min_val, // lower threshold
+ accreal max_val, // upper threshold
+ bool inplace);
+TH_API void THNN_(HardTanh_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *gradOutput, // gradient w.r.t. module's output
+ THTensor *gradInput, // [OUT] gradient w.r.t. the input
+ accreal min_val, // lower threshold
+ accreal max_val, // upper threshold
+ bool inplace);
+
+TH_API void THNN_(L1Cost_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *output); // [OUT] output tensor
+TH_API void THNN_(L1Cost_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *gradOutput, // [OPTIONAL] gradient w.r.t module's output
+ THTensor *gradInput); // [OUT] gradient w.r.t the input
+
+TH_API void THNN_(LeakyReLU_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // [MODIFIED] input tensor
+ THTensor *output, // [OUT] output tensor
+ accreal negval, // negative part slope
+ bool inplace); // if true, modifies the input tensor and sets the output tensor on it (no additional memory is allocated)
+TH_API void THNN_(LeakyReLU_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *gradOutput, // [MODIFIED] gradient w.r.t. module's output
+ THTensor *gradInput, // [OUT] gradient w.r.t. the input
+ accreal negval, // negative part slope
+ bool inplace); // if true, modifies gradOutput and sets gradInput onto it (no additional memory is allocated)
+
+TH_API void THNN_(GRUFused_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *hidden,
+ THTensor *bias1, // [OPTIONAL]
+ THTensor *bias2, // [OPTIONAL]
+ THTensor *hx,
+ THTensor *output,
+ THTensor *storage);
+TH_API void THNN_(GRUFused_updateGradInput)(
+ THNNState *state,
+ THTensor *gradInInput,
+ THTensor *gradInHidden,
+ THTensor *gradOutput,
+ THTensor *gradInputHx,
+ THTensor *storage);
+
+TH_API void THNN_(LSTMFused_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *hidden,
+ THTensor *bias1, // [OPTIONAL]
+ THTensor *bias2, // [OPTIONAL]
+ THTensor *cell,
+ THTensor *output,
+ THTensor *outputCell);
+TH_API void THNN_(LSTMFused_updateGradInput)(
+ THNNState *state,
+ THTensor *storage,
+ THTensor *gradInGates,
+ THTensor *cx,
+ THTensor *cy,
+ THTensor *gradOutput,
+ THTensor *gradOutputCell,
+ THTensor *gradInputCx);
+
+TH_API void THNN_(LogSigmoid_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *output, // output tensor
+ THTensor *buffer); // [BUFFER]
+TH_API void THNN_(LogSigmoid_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input
+ THTensor *gradOutput, // gradient w.r.t. module's output
+ THTensor *gradInput, // [OUT] gradient w.r.t. input
+ THTensor *buffer); // [BUFFER]
+
+TH_API void THNN_(LogSoftMax_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *output); // [OUT] output tensor
+TH_API void THNN_(LogSoftMax_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *gradOutput, // gradient w.r.t. module's output
+ THTensor *gradInput, // [OUT] gradient w.r.t. input
+ THTensor *output); // module's output
+
+TH_API void THNN_(LookupTable_accGradParameters)(
+ THNNState *state,
+ THIndexTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THIntegerTensor *count,
+ THTensor *sorted, // [OPTIONAL]
+ THIndexTensor *indices, // [OPTIONAL]
+ bool scaleGradByFreq,
+ int paddingValue,
+ accreal scale);
+
+TH_API void THNN_(LookupTable_renorm)(
+ THNNState *state, // library's state
+ THIndexTensor *idx, // vector containing row indices (modified in function)
+ THTensor *weight, // 2D tensor whose rows will be renormalized
+ accreal maxNorm, // maximum norm
+ accreal normType); // the norm type (e.g. normType = 2 gives the 2-norm)
+
+TH_API void THNN_(MarginCriterion_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *target, // target tensor (should contain only 1s and -1s)
+ THTensor *output, // [OUT] a one-element tensor containing the loss
+ bool sizeAverage, // if true, the loss is normalized by **total number of elements**
+ accreal margin); // a margin that is required for the loss to be 0
+
+TH_API void THNN_(MarginCriterion_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *target, // target tensor (should contain only 1s and -1s)
+ THTensor *gradInput, // [OUT] gradient w.r.t. module's input
+ bool sizeAverage, // if true, the gradient is normalized by **total number of elements**
+ accreal margin); // a margin that is required for the loss to be 0
+
+TH_API void THNN_(SoftMarginCriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *output,
+ bool sizeAverage);
+
+TH_API void THNN_(SoftMarginCriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage);
+
+TH_API void THNN_(MSECriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *output,
+ bool sizeAverage);
+TH_API void THNN_(MSECriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage);
+
+TH_API void THNN_(MultiLabelMarginCriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THIndexTensor *target,
+ THTensor *output,
+ THTensor *isTarget,
+ bool sizeAverage);
+TH_API void THNN_(MultiLabelMarginCriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THIndexTensor *target,
+ THTensor *gradInput,
+ THTensor *isTarget,
+ bool sizeAverage);
+
+TH_API void THNN_(MultiMarginCriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THIndexTensor *target,
+ THTensor *output,
+ bool sizeAverage,
+ int p,
+ THTensor* weights, // [OPTIONAL]
+ accreal margin);
+TH_API void THNN_(MultiMarginCriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THIndexTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage,
+ int p,
+ THTensor *weights, // [OPTIONAL]
+ accreal margin);
+
+TH_API void THNN_(PReLU_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THIndex_t nOutputPlane);
+TH_API void THNN_(PReLU_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THIndex_t nOutputPlane);
+TH_API void THNN_(PReLU_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *gradWeight,
+ THTensor *gradWeightBuf,
+ THTensor *gradWeightBuf2,
+ THIndex_t nOutputPlane,
+ accreal scale);
+
+TH_API void THNN_(Linear_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *addBuffer);
+TH_API void THNN_(Linear_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight);
+TH_API void THNN_(Linear_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *addBuffer,
+ accreal scale);
+
+TH_API void THNN_(RReLU_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *noise,
+ accreal lower,
+ accreal upper,
+ bool train,
+ bool inplace,
+ THGenerator *generator);
+TH_API void THNN_(RReLU_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *noise,
+ accreal lower,
+ accreal upper,
+ bool train,
+ bool inplace);
+
+TH_API void THNN_(Sigmoid_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output);
+TH_API void THNN_(Sigmoid_updateGradInput)(
+ THNNState *state,
+ THTensor *input, // [OPTIONAL]
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *output);
+
+TH_API void THNN_(SmoothL1Criterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *output,
+ bool sizeAverage);
+TH_API void THNN_(SmoothL1Criterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage);
+
+TH_API void THNN_(SoftMax_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output);
+TH_API void THNN_(SoftMax_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *output);
+
+TH_API void THNN_(SoftPlus_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ accreal beta,
+ accreal threshold);
+TH_API void THNN_(SoftPlus_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *output,
+ accreal beta,
+ accreal threshold);
+
+TH_API void THNN_(SoftShrink_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ accreal lambda);
+TH_API void THNN_(SoftShrink_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ accreal lambda);
+
+TH_API void THNN_(IndexLinear_updateOutput)(
+ THNNState *state,
+ THIndexTensor *keys,
+ long keysOffset,
+ THTensor *values,
+ THIndexTensor *sizes,
+ THIndexTensor *cumSumSizes,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *normalizedValues,
+ int train);
+TH_API void THNN_(IndexLinear_accGradParameters)(
+ THNNState *state,
+ THIndexTensor *keys,
+ long keysOffset,
+ THTensor *values,
+ THIndexTensor *sizes,
+ THIndexTensor *cumSumSizes,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor* valuesBuffer,
+ accreal weightDecay,
+ accreal scale);
+TH_API void THNN_(IndexLinear_accUpdateGradParameters)(
+ THNNState *state,
+ THIndexTensor *keys,
+ long keysOffset,
+ THTensor *values,
+ THIndexTensor *sizes,
+ THIndexTensor *cumSumSizes,
+ THTensor *gradOutput,
+ THTensor *weight,
+ THTensor *bias,
+ accreal weightDecay,
+ accreal scale);
+TH_API void THNN_(IndexLinear_updateParameters)(
+ THNNState *state,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *weight,
+ THTensor *bias,
+ THIndexTensor *runningKeys,
+ THIndexTensor *cumSumSizes,
+ long keysOffset,
+ accreal weightDecay,
+ accreal learningRate);
+
+TH_API void THNN_(SparseLinear_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias);
+TH_API void THNN_(SparseLinear_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *weight,
+ THTensor *bias,
+ accreal weightDecay,
+ accreal scale);
+TH_API void THNN_(SparseLinear_zeroGradParameters)(
+ THNNState *state,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *lastInput);
+TH_API void THNN_(SparseLinear_updateParameters)(
+ THNNState *state,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *lastInput,
+ accreal learningRate);
+TH_API void THNN_(SparseLinear_legacyUpdateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias);
+TH_API void THNN_(SparseLinear_legacyAccGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *weight,
+ THTensor *bias,
+ accreal weightDecay,
+ accreal scale);
+TH_API void THNN_(SparseLinear_legacyZeroGradParameters)(
+ THNNState *state,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *lastInput);
+TH_API void THNN_(SparseLinear_legacyUpdateParameters)(
+ THNNState *state,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *lastInput,
+ accreal learningRate);
+
+TH_API void THNN_(Sqrt_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ accreal eps);
+TH_API void THNN_(Sqrt_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *output);
+
+TH_API void THNN_(Square_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output);
+TH_API void THNN_(Square_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput);
+
+TH_API void THNN_(Tanh_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output);
+TH_API void THNN_(Tanh_updateGradInput)(
+ THNNState *state,
+ THTensor *input, // [OPTIONAL]
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *output);
+
+TH_API void THNN_(Threshold_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ accreal threshold,
+ accreal val,
+ bool inplace);
+TH_API void THNN_(Threshold_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ accreal threshold,
+ accreal val,
+ bool inplace);
+
+TH_API void THNN_(TemporalConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ int kW, int dW,
+ int inputFrameSize,
+ int outputFrameSize);
+TH_API void THNN_(TemporalConvolution_updateGradInput)(
+ THNNState* state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ int kW, int dW);
+TH_API void THNN_(TemporalConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ int kW, int dW,
+ accreal scale);
+TH_API void THNN_(TemporalMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int kW, int dW);
+TH_API void THNN_(TemporalMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int kW, int dW);
+TH_API void THNN_(TemporalSubSampling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ int kW, int dW,
+ int inputFrameSize);
+TH_API void THNN_(TemporalSubSampling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ int kW, int dW);
+TH_API void THNN_(TemporalSubSampling_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ int kW, int dW,
+ accreal scale);
+
+TH_API void THNN_(TemporalRowConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW,
+ int dW,
+ int padW,
+ bool featFirst);
+TH_API void THNN_(TemporalRowConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW,
+ int dW,
+ int padW,
+ bool featFirst);
+TH_API void THNN_(TemporalRowConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW,
+ int dW,
+ int padW,
+ bool featFirst,
+ accreal scale);
+
+TH_API void THNN_(BatchNormalization_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight, // [OPTIONAL]
+ THTensor *bias, // [OPTIONAL]
+ THTensor *running_mean,
+ THTensor *running_var,
+ THTensor *save_mean,
+ THTensor *save_std,
+ bool train,
+ double momentum,
+ double eps);
+TH_API void THNN_(BatchNormalization_backward)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput, // [OPTIONAL]
+ THTensor *gradWeight, // [OPTIONAL]
+ THTensor *gradBias, // [OPTIONAL]
+ THTensor *weight, // [OPTIONAL]
+ THTensor *running_mean,
+ THTensor *running_var,
+ THTensor *save_mean,
+ THTensor *save_std,
+ bool train,
+ double scale,
+ double eps);
+
+TH_API void THNN_(SpatialConvolutionMap_updateOutput)(
+ THNNState *state, // library state
+ THTensor *input, // input tensor
+ THTensor *output, // [OUT] convolution output
+ THTensor *weight, // 3D weight tensor (connTable:size(1) x kH x kW)
+ THTensor *bias, // 1D bias tensor (nOutputPlane)
+ THTensor *connTable, // connection table
+ int nInputPlane, // number of input planes
+ int nOutputPlane, // number of output planes
+ int dW, int dH); // stride
+TH_API void THNN_(SpatialConvolutionMap_updateGradInput)(
+ THNNState *state, // library state
+ THTensor *input, // input tensor
+ THTensor *gradOutput, // gradient w.r.t. output
+ THTensor *gradInput, // [OUT] gradient w.r.t. input
+ THTensor *weight, // 3D weight tensor (connTable:size(1) x kH x kW)
+ THTensor *bias, // 1D bias tensor (nOutputPlane)
+ THTensor *connTable, // connection table
+ int nInputPlane, // number of input planes
+ int nOutputPlane, // number of output planes
+ int dW, int dH); // stride
+TH_API void THNN_(SpatialConvolutionMap_accGradParameters)(
+ THNNState *state, // library state
+ THTensor *input, // input tensor
+ THTensor *gradOutput, // gradient w.r.t. output
+ THTensor *gradWeight, // 3D gradWeight tensor (connTable:size(1) x kH x kW)
+ THTensor *gradBias, // 1D gradBias tensor (nOutputPlane)
+ THTensor *connTable, // connection table
+ int nInputPlane, // number of input planes
+ int nOutputPlane, // number of output planes
+ int dW, int dH, // stride
+ accreal scale); // scaling factor
+
+TH_API void THNN_(SpatialConvolutionMM_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias, // [OPTIONAL]
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH);
+TH_API void THNN_(SpatialConvolutionMM_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH);
+TH_API void THNN_(SpatialConvolutionMM_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias, // [OPTIONAL]
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ accreal scale);
+
+TH_API void THNN_(SpatialDepthWiseConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias, // [OPTIONAL]
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH);
+TH_API void THNN_(SpatialDepthWiseConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH);
+TH_API void THNN_(SpatialDepthWiseConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias, // [OPTIONAL]
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ accreal scale);
+
+TH_API void THNN_(SpatialConvolutionLocal_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ long inputWidth, long inputHeight,
+ long outputWidth, long outputHeight);
+TH_API void THNN_(SpatialConvolutionLocal_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ long inputWidth, long inputHeight,
+ long outputWidth, long outputHeight);
+TH_API void THNN_(SpatialConvolutionLocal_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ long inputWidth, long inputHeight,
+ long outputWidth, long outputHeight,
+ accreal scale);
+
+TH_API void THNN_(SpatialAdaptiveMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int owidth, int oheight);
+TH_API void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices);
+
+TH_API void THNN_(SpatialAdaptiveAveragePooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int owidth, int oheight);
+TH_API void THNN_(SpatialAdaptiveAveragePooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput);
+
+TH_API void THNN_(SpatialAveragePooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ bool ceil_mode,
+ bool count_include_pad);
+TH_API void THNN_(SpatialAveragePooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ bool ceil_mode,
+ bool count_include_pad);
+
+TH_API void THNN_(SpatialFractionalMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int outputW, int outputH,
+ int poolSizeW, int poolSizeH,
+ THIndexTensor *indices,
+ THTensor *randomSamples);
+TH_API void THNN_(SpatialFractionalMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int outputW, int outputH,
+ int poolSizeW, int poolSizeH,
+ THIndexTensor *indices);
+
+TH_API void THNN_(SpatialFullConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias, // [OPTIONAL]
+ THTensor *columns,
+ THTensor *ones,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int adjW, int adjH);
+TH_API void THNN_(SpatialFullConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *gradColumns,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int adjW, int adjH);
+TH_API void THNN_(SpatialFullConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias, // [OPTIONAL]
+ THTensor *columns,
+ THTensor *ones,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int adjW, int adjH,
+ accreal scale);
+
+TH_API void THNN_(SpatialFullConvolutionMap_updateOutput)(
+ THNNState *state, // library state
+ THTensor *input, // input tensor
+ THTensor *output, // [OUT] convolution output
+ THTensor *weight, // 3D weight tensor (connTable:size(1) x kH x kW)
+ THTensor *bias, // 1D bias tensor (nOutputPlane)
+ THTensor *connTable, // connection table
+ int nInputPlane, // number of input planes
+ int nOutputPlane, // number of output planes
+ int dW, int dH); // stride
+TH_API void THNN_(SpatialFullConvolutionMap_updateGradInput)(
+ THNNState *state, // library state
+ THTensor *input, // input tensor
+ THTensor *gradOutput, // gradient w.r.t. output
+ THTensor *gradInput, // [OUT] gradient w.r.t. input
+ THTensor *weight, // 3D weight tensor (connTable:size(1) x kH x kW)
+ THTensor *bias, // 1D bias tensor (nOutputPlane)
+ THTensor *connTable, // connection table
+ int nInputPlane, // number of input planes
+ int nOutputPlane, // number of output planes
+ int dW, int dH); // stride
+TH_API void THNN_(SpatialFullConvolutionMap_accGradParameters)(
+ THNNState *state, // library state
+ THTensor *input, // input tensor
+ THTensor *gradOutput, // gradient w.r.t. output
+ THTensor *gradWeight, // 3D gradWeight tensor (connTable:size(1) x kH x kW)
+ THTensor *gradBias, // 1D gradBias tensor (nOutputPlane)
+ THTensor *connTable, // connection table
+ int nInputPlane, // number of input planes
+ int nOutputPlane, // number of output planes
+ int dW, int dH, // stride
+ accreal scale); // scaling factor
+
+TH_API void THNN_(SpatialDilatedConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias, // [OPTIONAL]
+ THTensor *columns,
+ THTensor *ones,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int dilationW, int dilationH);
+
+TH_API void THNN_(SpatialDilatedConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *gradColumns,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int dilationW, int dilationH);
+
+TH_API void THNN_(SpatialDilatedConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias, // [OPTIONAL]
+ THTensor *columns,
+ THTensor *ones,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int dilationW, int dilationH,
+ accreal scale);
+
+TH_API void THNN_(SpatialMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ bool ceil_mode);
+TH_API void THNN_(SpatialMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ bool ceil_mode);
+
+TH_API void THNN_(SpatialDilatedMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int dilationW, int dilationH,
+ bool ceil_mode);
+TH_API void THNN_(SpatialDilatedMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int dilationW, int dilationH,
+ bool ceil_mode);
+
+TH_API void THNN_(SpatialMaxUnpooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int owidth, int oheight);
+TH_API void THNN_(SpatialMaxUnpooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int owidth, int oheight);
+
+TH_API void THNN_(SpatialSubSampling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ int kW, int kH,
+ int dW, int dH);
+TH_API void THNN_(SpatialSubSampling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ int kW, int kH,
+ int dW, int dH);
+TH_API void THNN_(SpatialSubSampling_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ int kW, int kH,
+ int dW, int dH,
+ accreal scale);
+
+TH_API void THNN_(SpatialUpSamplingNearest_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int scale_factor);
+TH_API void THNN_(SpatialUpSamplingNearest_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int scale_factor);
+
+TH_API void THNN_(SpatialUpSamplingBilinear_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int outputHeight,
+ int outputWidth);
+TH_API void THNN_(SpatialUpSamplingBilinear_updateGradInput)(
+ THNNState *state,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int nbatch,
+ int nchannels,
+ int inputHeight,
+ int inputWidth,
+ int outputHeight,
+ int outputWidth);
+
+TH_API void THNN_(unfolded_acc)(
+ THTensor *finput,
+ THTensor *input,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int nInputPlane,
+ int inputWidth, int inputHeight,
+ int outputWidth, int outputHeight);
+TH_API void THNN_(unfolded_copy)(
+ THTensor *finput,
+ THTensor *input,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int nInputPlane,
+ int inputWidth, int inputHeight,
+ int outputWidth, int outputHeight);
+
+TH_API void THNN_(VolumetricAveragePooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH);
+TH_API void THNN_(VolumetricAveragePooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH);
+
+TH_API void THNN_(VolumetricConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias, // [OPTIONAL]
+ THTensor *finput,
+ THTensor *fgradInput,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH);
+TH_API void THNN_(VolumetricConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *finput,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH);
+TH_API void THNN_(VolumetricConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias, // [OPTIONAL]
+ THTensor *finput,
+ THTensor *fgradInput,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH,
+ accreal scale);
+
+TH_API void THNN_(VolumetricConvolutionMM_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias, // [OPTIONAL]
+ THTensor *finput,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH);
+TH_API void THNN_(VolumetricConvolutionMM_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH);
+TH_API void THNN_(VolumetricConvolutionMM_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias, // [OPTIONAL]
+ THTensor *finput,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH,
+ accreal scale);
+
+TH_API void THNN_(VolumetricFractionalMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int outputT, int outputW, int outputH,
+ int poolSizeT, int poolSizeW, int poolSizeH,
+ THIndexTensor *indices,
+ THTensor *randomSamples);
+TH_API void THNN_(VolumetricFractionalMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int outputT, int outputW, int outputH,
+ int poolSizeT, int poolSizeW, int poolSizeH,
+ THIndexTensor *indices);
+
+TH_API void THNN_(VolumetricFullConvolution_updateOutput)(
+ THNNState *state, // library state
+ THTensor *input, // 4D or 5D (batch) tensor
+ THTensor *output, // [OUT] volumetric convolution output
+ THTensor *weight, // weight tensor (nInputPlane x nOutputPlane x kT x kH x kW)
+ THTensor *bias, // [OPTIONAL] gradBias tensor (nOutputPlane)
+ THTensor *finput, // [OUT] internal columns buffer
+ THTensor *fgradInput, // [OUT] internal ones buffer
+ int dT, int dW, int dH, // stride of the convolution
+ int pT, int pW, int pH, // padding
+ int aT, int aW, int aH); // extra output adjustment
+TH_API void THNN_(VolumetricFullConvolution_updateGradInput)(
+ THNNState *state, // library state
+ THTensor *input, // 4D or 5D (batch) tensor
+ THTensor *gradOutput, // gradient w.r.t. output
+ THTensor *gradInput, // [OUT] gradient w.r.t. input
+ THTensor *weight, // weight tensor (nInputPlane x nOutputPlane x kT x kH x kW)
+ THTensor *finput, // internal columns buffer
+ THTensor *fgradInput, // internal ones buffer
+ int dT, int dW, int dH, // stride
+ int pT, int pW, int pH, // padding
+ int aT, int aW, int aH); // extra output adjustment
+TH_API void THNN_(VolumetricFullConvolution_accGradParameters)(
+ THNNState *state, // library state
+ THTensor *input, // 4D or 5D (batch) tensor
+ THTensor *gradOutput, // gradient w.r.t. output
+ THTensor *gradWeight, // gradWeight tensor (nInputPlane x nOutputPlane x kT x kH x kW)
+ THTensor *gradBias, // [OPTIONAL] gradBias tensor (nOutputPlane)
+ THTensor *finput, // internal columns buffer
+ THTensor *fgradInput, // internal ones buffer
+ int dT, int dW, int dH, // stride
+ int pT, int pW, int pH, // padding
+ int aT, int aW, int aH, // extra output adjustment
+ accreal scale); // scaling factor
+
+TH_API void THNN_(VolumetricDilatedConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias, // [OPTIONAL]
+ THTensor *columns,
+ THTensor *ones,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH,
+ int dilationT, int dilationW, int dilationH);
+
+TH_API void THNN_(VolumetricDilatedConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *gradColumns,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH,
+ int dilationT, int dilationW, int dilationH);
+
+TH_API void THNN_(VolumetricDilatedConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias, // [OPTIONAL]
+ THTensor *columns,
+ THTensor *ones,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH,
+ int dilationT, int dilationW, int dilationH,
+ accreal scale);
+
+TH_API void THNN_(VolumetricMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH,
+ bool ceilMode);
+TH_API void THNN_(VolumetricMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH,
+ bool ceilMode);
+
+TH_API void THNN_(VolumetricDilatedMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH,
+ int dilationT, int dilationW, int dilationH,
+ bool ceilMode);
+TH_API void THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH,
+ int dilationT, int dilationW, int dilationH,
+ bool ceilMode);
+
+TH_API void THNN_(VolumetricMaxUnpooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int oT, int oW, int oH,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH);
+TH_API void THNN_(VolumetricMaxUnpooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int oT, int oW, int oH,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH);
+
+TH_API void THNN_(SpatialReflectionPadding_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int pad_l, int pad_r,
+ int pad_t, int pad_b);
+
+TH_API void THNN_(SpatialReflectionPadding_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int pad_l, int pad_r,
+ int pad_t, int pad_b);
+
+TH_API void THNN_(SpatialReplicationPadding_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int pad_l, int pad_r,
+ int pad_t, int pad_b);
+
+TH_API void THNN_(SpatialReplicationPadding_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int pad_l, int pad_r,
+ int pad_t, int pad_b);
+
+TH_API void THNN_(VolumetricReplicationPadding_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int pleft, int pright,
+ int ptop, int pbottom,
+ int pfront, int pback);
+
+TH_API void THNN_(VolumetricReplicationPadding_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int pleft, int pright,
+ int ptop, int pbottom,
+ int pfront, int pback);
+
+TH_API void THNN_(VolumetricUpSamplingNearest_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int scale_factor);
+TH_API void THNN_(VolumetricUpSamplingNearest_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int scale_factor);
+
+TH_API void THNN_(VolumetricUpSamplingTrilinear_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int outputDepth,
+ int outputHeight,
+ int outputWidth);
+TH_API void THNN_(VolumetricUpSamplingTrilinear_updateGradInput)(
+ THNNState *state,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int nbatch,
+ int nchannels,
+ int inputDepth,
+ int inputHeight,
+ int inputWidth,
+ int outputDepth,
+ int outputHeight,
+ int outputWidth);
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/Tanh.c b/contrib/lua-torch/nn/lib/THNN/generic/Tanh.c
new file mode 100644
index 000000000..ecf0708c2
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/Tanh.c
@@ -0,0 +1,49 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/Tanh.c"
+#else
+
+void THNN_(Tanh_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output)
+{
+ THTensor_(tanh)(output, input);
+}
+
+void THNN_(Tanh_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *output)
+{
+ THNN_CHECK_SHAPE(output, gradOutput);
+ THTensor_(resizeAs)(gradInput, output);
+
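+ /* d/dx tanh(x) = 1 - tanh(x)^2, so gradInput = gradOutput * (1 - output^2).
+ 1D or non-contiguous tensors go through the generic apply macro; the
+ contiguous case uses a flat OpenMP loop over the raw data. */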
+ if (output->nDimension == 1 ||
+ !THTensor_(isContiguous)(output) ||
+ !THTensor_(isContiguous)(gradOutput) ||
+ !THTensor_(isContiguous)(gradInput))
+ {
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output,
+ real z = *output_data;
+ *gradInput_data = *gradOutput_data * (1. - z*z);
+ );
+ }
+ else
+ {
+ real* ptr_gradOutput = THTensor_(data)(gradOutput);
+ real* ptr_gradInput = THTensor_(data)(gradInput);
+ real* ptr_output = THTensor_(data)(output);
+ long i;
+
+#pragma omp parallel for private(i)
+ for (i = 0; i < THTensor_(nElement)(gradInput); i++)
+ {
+ real z = ptr_output[i];
+ ptr_gradInput[i] = ptr_gradOutput[i] * (1. - z*z);
+ }
+ }
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/TemporalConvolution.c b/contrib/lua-torch/nn/lib/THNN/generic/TemporalConvolution.c
new file mode 100644
index 000000000..8cfd97d85
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/TemporalConvolution.c
@@ -0,0 +1,398 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/TemporalConvolution.c"
+#else
+
+static inline void THNN_(TemporalConvolution_shapeCheck)(
+ THNNState *state,
+ THTensor *input,
+ int kW,
+ int dW,
+ int *inputFrameSize) {
+
+ THArgCheck(kW > 0, 9,
+ "kernel size should be greater than zero, but got kW: %d", kW);
+ THArgCheck(dW > 0, 11,
+ "stride should be greater than zero, but got dW: %d", dW);
+
+ int dimS = 0; // sequence dimension
+ int dimF = 1; // feature dimension
+
+ if (input->nDimension == 3)
+ {
+ dimS = 1;
+ dimF = 2;
+ }
+ THNN_ARGCHECK(input->nDimension == 2 || input->nDimension == 3, 2, input,
+ "2D or 3D (batch mode) tensor expected for input, but got: %s");
+ if (inputFrameSize != NULL) {
+ THArgCheck(input->size[dimF] == *inputFrameSize, 2,
+ "invalid input frame size. Got: %d, Expected: %d",
+ input->size[dimF], *inputFrameSize);
+ }
+ THArgCheck(input->size[dimS] >= kW, 2,
+ "input sequence smaller than kernel size. Got: %d, Expected: %d",
+ input->size[dimS], kW);
+}
+
+void THNN_(TemporalConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ int kW,
+ int dW,
+ int inputFrameSize,
+ int outputFrameSize)
+{
+ THTensor *outputWindow, *inputWindow;
+ int nInputFrame, nOutputFrame;
+ long k, i;
+
+ int dimS = 0; // sequence dimension
+ int dimF = 1; // feature dimension
+
+ if (input->nDimension == 3)
+ {
+ dimS = 1;
+ dimF = 2;
+ }
+
+ THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous");
+ THArgCheck(!bias || THTensor_(isContiguous)(bias), 5, "bias must be contiguous");
+ THNN_(TemporalConvolution_shapeCheck)
+ (state, input, kW, dW, &inputFrameSize);
+ input = THTensor_(newContiguous)(input);
+ outputWindow = THTensor_(new)();
+ inputWindow = THTensor_(new)();
+
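+ /* each output frame sees a window of kW input frames advanced by dW,
+ hence nOutputFrame = (nInputFrame - kW) / dW + 1 */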
+ nInputFrame = input->size[dimS];
+ nOutputFrame = (nInputFrame - kW) / dW + 1;
+
+ if (input->nDimension == 2)
+ {
+ THTensor_(resize2d)(output,
+ nOutputFrame,
+ outputFrameSize);
+
+ /* bias first */
+ for(k = 0; k < nOutputFrame; k++)
+ {
+ THTensor_(select)(outputWindow, output, 0, k);
+ THTensor_(copy)(outputWindow, bias);
+ }
+
+ /* handle output frames in groups spaced s = (kW-1)/dW + 1 apart;
+ their input windows do not overlap, so each group is one addmm */
+ for(k = 0; nOutputFrame > 0; k++)
+ {
+ long outputFrameStride = (kW-1)/dW+1;
+ long inputFrameStride = outputFrameStride*dW;
+ long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
+ nOutputFrame -= nFrame;
+
+ THTensor_(setStorage2d)(inputWindow, input->storage,
+ input->storageOffset+k*dW*input->size[1],
+ nFrame, inputFrameStride*input->size[1],
+ kW*input->size[1], 1);
+
+ THTensor_(setStorage2d)(outputWindow, output->storage,
+ output->storageOffset + k*output->size[1],
+ nFrame, outputFrameStride*output->size[1],
+ output->size[1], 1);
+
+ THTensor *tweight = THTensor_(new)();
+ THTensor_(transpose)(tweight, weight, 0, 1);
+ THTensor_(addmm)(outputWindow, 1, outputWindow, 1, inputWindow, tweight);
+ THTensor_(free)(tweight);
+ }
+ }
+ else
+ {
+ THTensor *outputSample = THTensor_(new)();
+ THTensor *inputSample = THTensor_(new)();
+ int nBatchFrame = input->size[0];
+
+ THTensor_(resize3d)(output,
+ nBatchFrame,
+ nOutputFrame,
+ outputFrameSize);
+
+ for(i = 0; i < nBatchFrame; i++)
+ {
+ THTensor_(select)(outputSample, output, 0, i);
+ THTensor_(select)(inputSample, input, 0, i);
+ long nOutputSampleFrame = nOutputFrame;
+
+ /* bias first */
+ for(k = 0; k < nOutputFrame; k++)
+ {
+ THTensor_(select)(outputWindow, outputSample, 0, k);
+ THTensor_(copy)(outputWindow, bias);
+ }
+
+ /* same strided grouping as above, per batch sample */
+ for(k = 0; nOutputSampleFrame > 0; k++)
+ {
+ long outputFrameStride = (kW-1)/dW+1;
+ long inputFrameStride = outputFrameStride*dW;
+ long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
+ nOutputSampleFrame -= nFrame;
+
+ THTensor_(setStorage2d)(inputWindow, inputSample->storage,
+ inputSample->storageOffset+k*dW*inputSample->size[1],
+ nFrame, inputFrameStride*inputSample->size[1],
+ kW*inputSample->size[1], 1);
+
+ THTensor_(setStorage2d)(outputWindow, outputSample->storage,
+ outputSample->storageOffset + k*outputSample->size[1],
+ nFrame, outputFrameStride*outputSample->size[1],
+ outputSample->size[1], 1);
+
+ THTensor *tweight = THTensor_(new)();
+ THTensor_(transpose)(tweight, weight, 0, 1);
+ THTensor_(addmm)(outputWindow, 1, outputWindow, 1, inputWindow, tweight);
+ THTensor_(free)(tweight);
+ }
+ }
+ THTensor_(free)(outputSample);
+ THTensor_(free)(inputSample);
+ }
+
+ THTensor_(free)(outputWindow);
+ THTensor_(free)(inputWindow);
+ THTensor_(free)(input);
+
+}
+
+void THNN_(TemporalConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ int kW,
+ int dW)
+{
+ long nInputFrame;
+ long nOutputFrame;
+
+ THTensor *gradOutputWindow;
+ THTensor *gradInputWindow;
+ long k, i;
+
+ int dimS = 0; // sequence dimension
+ int dimF = 1; // feature dimension
+
+ if (gradOutput->nDimension == 3)
+ {
+ dimS = 1;
+ dimF = 2;
+ }
+
+ THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous");
+ THNN_(TemporalConvolution_shapeCheck)(
+ state, input, kW, dW, NULL);
+ nInputFrame = input->size[dimS];
+ nOutputFrame = gradOutput->size[dimS];
+
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ gradOutputWindow = THTensor_(new)();
+ gradInputWindow = THTensor_(new)();
+
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ if (gradOutput->nDimension == 2)
+ {
+ /* same frame grouping as in updateOutput: one addmm per group */
+ for(k = 0; nOutputFrame > 0; k++)
+ {
+ long outputFrameStride = (kW-1)/dW+1;
+ long inputFrameStride = outputFrameStride*dW;
+ long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
+ nOutputFrame -= nFrame;
+
+ THTensor_(setStorage2d)(gradOutputWindow, gradOutput->storage,
+ gradOutput->storageOffset + k*gradOutput->size[1],
+ nFrame, outputFrameStride*gradOutput->size[1],
+ gradOutput->size[1], 1);
+
+ THTensor_(setStorage2d)(gradInputWindow, gradInput->storage,
+ gradInput->storageOffset+k*dW*gradInput->size[1],
+ nFrame, inputFrameStride*gradInput->size[1],
+ kW*gradInput->size[1], 1);
+
+ THTensor_(addmm)(gradInputWindow, 1, gradInputWindow, 1, gradOutputWindow, weight);
+ }
+ }
+ else
+ {
+ THTensor *gradOutputSample = THTensor_(new)();
+ THTensor *gradInputSample = THTensor_(new)();
+ int nBatchFrame = input->size[0];
+
+ for(i = 0; i < nBatchFrame; i++)
+ {
+ THTensor_(select)(gradOutputSample, gradOutput, 0, i);
+ THTensor_(select)(gradInputSample, gradInput, 0, i);
+ int nOutputSampleFrame = nOutputFrame;
+
+ /* same frame grouping, per batch sample */
+ for(k = 0; nOutputSampleFrame > 0; k++)
+ {
+ long outputFrameStride = (kW-1)/dW+1;
+ long inputFrameStride = outputFrameStride*dW;
+ long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
+ nOutputSampleFrame -= nFrame;
+
+ THTensor_(setStorage2d)(gradOutputWindow, gradOutputSample->storage,
+ gradOutputSample->storageOffset + k*gradOutputSample->size[1],
+ nFrame, outputFrameStride*gradOutputSample->size[1],
+ gradOutputSample->size[1], 1);
+
+ THTensor_(setStorage2d)(gradInputWindow, gradInputSample->storage,
+ gradInputSample->storageOffset+k*dW*gradInputSample->size[1],
+ nFrame, inputFrameStride*gradInputSample->size[1],
+ kW*gradInputSample->size[1], 1);
+
+ THTensor_(addmm)(gradInputWindow, 1, gradInputWindow, 1, gradOutputWindow, weight);
+ }
+ }
+ THTensor_(free)(gradOutputSample);
+ THTensor_(free)(gradInputSample);
+ }
+
+ THTensor_(free)(gradOutputWindow);
+ THTensor_(free)(gradInputWindow);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(input);
+
+}
+
+void THNN_(TemporalConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ int kW,
+ int dW,
+ accreal scale_)
+{
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ long nInputFrame;
+ long nOutputFrame;
+
+ THTensor *gradOutputWindow;
+ THTensor *inputWindow;
+ long k, i;
+
+ int dimS = 0; // sequence dimension
+ int dimF = 1; // feature dimension
+
+ if (gradOutput->nDimension == 3)
+ {
+ dimS = 1;
+ dimF = 2;
+ }
+
+ THNN_(TemporalConvolution_shapeCheck)(
+ state, input, kW, dW, NULL);
+ nInputFrame = input->size[dimS];
+ nOutputFrame = gradOutput->size[dimS];
+
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ gradOutputWindow = THTensor_(new)();
+ inputWindow = THTensor_(new)();
+
+ if (input->nDimension == 2)
+ {
+ /* bias first */
+ for(k = 0; k < nOutputFrame; k++)
+ {
+ THTensor_(select)(gradOutputWindow, gradOutput, 0, k);
+ THTensor_(cadd)(gradBias, gradBias, scale, gradOutputWindow);
+ }
+
+ /* same frame grouping as in updateOutput: one addmm per group */
+ for(k = 0; nOutputFrame > 0; k++)
+ {
+ long outputFrameStride = (kW-1)/dW+1;
+ long inputFrameStride = outputFrameStride*dW;
+ long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
+ nOutputFrame -= nFrame;
+
+ THTensor_(setStorage2d)(inputWindow, input->storage,
+ input->storageOffset+k*dW*input->size[1],
+ nFrame, inputFrameStride*input->size[1],
+ kW*input->size[1], 1);
+
+ THTensor_(setStorage2d)(gradOutputWindow, gradOutput->storage,
+ gradOutput->storageOffset + k*gradOutput->size[1],
+ nFrame, outputFrameStride*gradOutput->size[1],
+ gradOutput->size[1], 1);
+
+ THTensor *tgradOutputWindow = THTensor_(new)();
+ THTensor_(transpose)(tgradOutputWindow, gradOutputWindow, 0, 1);
+ THTensor_(addmm)(gradWeight, 1, gradWeight, scale, tgradOutputWindow, inputWindow);
+ THTensor_(free)(tgradOutputWindow);
+ }
+ }
+ else
+ {
+ THTensor *gradOutputSample = THTensor_(new)();
+ THTensor *inputSample = THTensor_(new)();
+ int nBatchFrame = input->size[0];
+
+ for(i = 0; i < nBatchFrame; i++)
+ {
+ THTensor_(select)(gradOutputSample, gradOutput, 0, i);
+ THTensor_(select)(inputSample, input, 0, i);
+ int nOutputSampleFrame = nOutputFrame;
+
+ /* bias first */
+ for(k = 0; k < nOutputFrame; k++)
+ {
+ THTensor_(select)(gradOutputWindow, gradOutputSample, 0, k);
+ THTensor_(cadd)(gradBias, gradBias, scale, gradOutputWindow);
+ }
+
+ /* same frame grouping, per batch sample */
+ for(k = 0; nOutputSampleFrame > 0; k++)
+ {
+ long outputFrameStride = (kW-1)/dW+1;
+ long inputFrameStride = outputFrameStride*dW;
+ long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
+ nOutputSampleFrame -= nFrame;
+
+ THTensor_(setStorage2d)(inputWindow, inputSample->storage,
+ inputSample->storageOffset+k*dW*inputSample->size[1],
+ nFrame, inputFrameStride*inputSample->size[1],
+ kW*inputSample->size[1], 1);
+
+ THTensor_(setStorage2d)(gradOutputWindow, gradOutputSample->storage,
+ gradOutputSample->storageOffset + k*gradOutputSample->size[1],
+ nFrame, outputFrameStride*gradOutputSample->size[1],
+ gradOutputSample->size[1], 1);
+
+ THTensor *tgradOutputWindow = THTensor_(new)();
+ THTensor_(transpose)(tgradOutputWindow, gradOutputWindow, 0, 1);
+ THTensor_(addmm)(gradWeight, 1, gradWeight, scale, tgradOutputWindow, inputWindow);
+ THTensor_(free)(tgradOutputWindow);
+ }
+ }
+ THTensor_(free)(gradOutputSample);
+ THTensor_(free)(inputSample);
+ }
+
+ THTensor_(free)(gradOutputWindow);
+ THTensor_(free)(inputWindow);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(input);
+
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/TemporalMaxPooling.c b/contrib/lua-torch/nn/lib/THNN/generic/TemporalMaxPooling.c
new file mode 100644
index 000000000..344c1b3fd
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/TemporalMaxPooling.c
@@ -0,0 +1,283 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/TemporalMaxPooling.c"
+#else
+
+static inline void THNN_(TemporalMaxPooling_shapeCheck)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THIndexTensor *indices,
+ int kW,
+ int dW) {
+ long niframe;
+ long framesize;
+ long noframe;
+
+ int dimS = 0; // sequence dimension
+ int dimF = 1; // feature dimension
+ int ndims = input->nDimension;
+
+ if (input->nDimension == 3)
+ {
+ dimS = 1;
+ dimF = 2;
+ }
+
+ niframe = input->size[dimS];
+ framesize = input->size[dimF];
+ noframe = (niframe - kW) / dW + 1;
+
+ THArgCheck(kW > 0, 5,
+ "kernel size should be greater than zero, but got kW: %d", kW);
+ THArgCheck(dW > 0, 6,
+ "stride should be greater than zero, but got dW: %d", dW);
+
+ THNN_ARGCHECK(input->nDimension == 2 || input->nDimension == 3, 2, input,
+ "2D or 3D (batch mode) tensor expected for input, but got: %s");
+ THArgCheck(input->size[dimS] >= kW, 2,
+ "input sequence smaller than kernel size. Got: %d, Expected: %d",
+ input->size[dimS], kW);
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndims, dimS, noframe);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndims, dimF, framesize);
+ }
+ if (indices != NULL) {
+ THNN_CHECK_DIM_SIZE_INDICES(indices, ndims, dimS, noframe);
+ THNN_CHECK_DIM_SIZE_INDICES(indices, ndims, dimF, framesize);
+ }
+}
+
+void THNN_(TemporalMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int kW,
+ int dW)
+{
+ long niframe;
+ long framesize;
+ long noframe;
+
+ real *input_data;
+ real *output_data;
+ THIndex_t *indices_data;
+
+ long t, y;
+
+ int dimS = 0; // sequence dimension
+ int dimF = 1; // feature dimension
+
+ THNN_(TemporalMaxPooling_shapeCheck)(state, input, NULL, NULL, kW, dW);
+
+ if (input->nDimension == 3)
+ {
+ dimS = 1;
+ dimF = 2;
+ }
+
+ /* sizes */
+ niframe = input->size[dimS];
+ framesize = input->size[dimF];
+ noframe = (niframe - kW) / dW + 1;
+
+ /* get contiguous input */
+ input = THTensor_(newContiguous)(input);
+
+ if (input->nDimension == 2)
+ {
+ /* resize output */
+ THTensor_(resize2d)(output, noframe, framesize);
+
+ /* indices will contain index locations for each output point */
+ THIndexTensor_(resize2d)(indices, noframe, framesize);
+
+ /* get raw pointers */
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+ indices_data = THIndexTensor_(data)(indices);
+
+ for(t = 0; t < noframe; t++)
+ {
+ real *ip = input_data + t*framesize*dW;
+ real *op = output_data + t*framesize;
+ THIndex_t *xp = indices_data + t*framesize;
+#pragma omp parallel for private(y)
+ for(y = 0; y < framesize; y++)
+ {
+ /* compute local max: */
+ long maxindex = -1;
+ real maxval = -THInf;
+ long x;
+ for(x = 0; x < kW; x++)
+ {
+ real val = ip[x*framesize+y];
+ if (val > maxval)
+ {
+ maxval = val;
+ maxindex = x;
+ }
+ }
+
+ /* set output to local max */
+ op[y] = maxval;
+ xp[y] = (THIndex_t)maxindex;
+ }
+ }
+ }
+ else
+ {
+ /* number of batch frames */
+ long nbframe = input->size[0];
+ long i;
+
+ /* resize output */
+ THTensor_(resize3d)(output, nbframe, noframe, framesize);
+
+ /* indices will contain index locations for each output point */
+ THIndexTensor_(resize3d)(indices, nbframe, noframe, framesize);
+
+ /* get raw pointers */
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+ indices_data = THIndexTensor_(data)(indices);
+
+ for(i = 0; i < nbframe; i++)
+ {
+ real *inputSample_data = input_data + i*niframe*framesize;
+ real *outputSample_data = output_data + i*noframe*framesize;
+ THIndex_t *indicesSample_data = indices_data + i*noframe*framesize;
+
+ for(t = 0; t < noframe; t++)
+ {
+ real *ip = inputSample_data + t*framesize*dW;
+ real *op = outputSample_data + t*framesize;
+ THIndex_t *xp = indicesSample_data + t*framesize;
+
+#pragma omp parallel for private(y)
+ for(y = 0; y < framesize; y++)
+ {
+ /* compute local max: */
+ long maxindex = -1;
+ real maxval = -THInf;
+ long x;
+ for(x = 0; x < kW; x++)
+ {
+ real val = ip[x*framesize+y];
+ if (val > maxval)
+ {
+ maxval = val;
+ maxindex = x;
+ }
+ }
+
+ /* set output to local max */
+ op[y] = maxval;
+ xp[y] = (THIndex_t)maxindex;
+ }
+ }
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(input);
+
+}
+
+void THNN_(TemporalMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int kW,
+ int dW)
+{
+ long niframe;
+ int noframe;
+ long framesize;
+
+ real *gradInput_data;
+ real *gradOutput_data;
+ THIndex_t *indices_data;
+
+ long t, y;
+
+ THNN_(TemporalMaxPooling_shapeCheck)(state, input, gradOutput, indices, kW, dW);
+ /* get contiguous gradOutput */
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ /* resize and zero */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ int dimS = 0; // sequence dimension
+ int dimF = 1; // feature dimension
+
+ if (input->nDimension == 3)
+ {
+ dimS = 1;
+ dimF = 2;
+ }
+ /* sizes */
+ niframe = input->size[dimS];
+ noframe = gradOutput->size[dimS];
+ framesize = gradOutput->size[dimF];
+
+ /* get raw pointers */
+ gradInput_data = THTensor_(data)(gradInput);
+ gradOutput_data = THTensor_(data)(gradOutput);
+ indices_data = THIndexTensor_(data)(indices);
+
+ if (input->nDimension == 2)
+ {
+ for(t = 0; t < noframe; t++)
+ {
+ real *gip = gradInput_data + t*framesize*dW;
+ real *gop = gradOutput_data + t*framesize;
+ THIndex_t *xp = indices_data + t*framesize;
+#pragma omp parallel for private(y)
+ for(y = 0; y < framesize; y++)
+ {
+ /* route the gradient to the saved max location */
+ long maxindex = (long)xp[y];
+ if (maxindex != -1)
+ gip[maxindex*framesize+y] += gop[y];
+ }
+ }
+ }
+ else
+ {
+ /* number of batch frames */
+ long nbframe = input->size[0];
+ long i;
+
+ for(i = 0; i < nbframe; i++)
+ {
+ real *gradInputSample_data = gradInput_data + i*niframe*framesize;
+ real *gradOutputSample_data = gradOutput_data + i*noframe*framesize;
+ THIndex_t *indicesSample_data = indices_data + i*noframe*framesize;
+
+ for(t = 0; t < noframe; t++)
+ {
+ real *gip = gradInputSample_data + t*framesize*dW;
+ real *gop = gradOutputSample_data + t*framesize;
+ THIndex_t *xp = indicesSample_data + t*framesize;
+#pragma omp parallel for private(y)
+ for(y = 0; y < framesize; y++)
+ {
+ /* route the gradient to the saved max location */
+ long maxindex = (long)xp[y];
+ if (maxindex != -1)
+ gip[maxindex*framesize+y] += gop[y];
+ }
+ }
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/TemporalRowConvolution.c b/contrib/lua-torch/nn/lib/THNN/generic/TemporalRowConvolution.c
new file mode 100644
index 000000000..e3ae41e22
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/TemporalRowConvolution.c
@@ -0,0 +1,472 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/TemporalRowConvolution.c"
+#else
+
+static inline void THNN_(TemporalRowConvolution_shapeCheck)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *weight,
+ THTensor *bias,
+ int kW,
+ int dW,
+ int padW) {
+
+ THArgCheck(kW > 0, 5,
+ "kernel size should be greater than zero, but got kW: %d", kW);
+ THArgCheck(dW > 0, 6,
+ "stride should be greater than zero, but got dW: %d", dW);
+ THNN_ARGCHECK(weight->nDimension == 3, 3, weight,
+ "3D weight tensor expected, but got: %s");
+ THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous");
+ THArgCheck(!bias || THTensor_(isContiguous)(bias), 5, "bias must be contiguous");
+
+ if (bias != NULL) {
+ THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]);
+ }
+
+ // we're always looking at (possibly batch) x feats x seq
+ int ndim = input->nDimension;
+ int dimF = 0;
+ int dimS = 1;
+
+ if (ndim == 3) {
+ ++dimS;
+ ++dimF;
+ }
+
+ THNN_ARGCHECK(ndim == 2 || ndim == 3, 1, input,
+ "2D or 3D (batch mode) input tensor expected, but got: %s");
+
+ long inputFrameSize = weight->size[0];
+ long nInputFrame = input->size[dimS];
+ long nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1;
+
+ if (nOutputFrame < 1) {
+ THError("Given input size: (%d x %d). "
+ "Calculated output size: (%d x %d). Output size is too small",
+ inputFrameSize, nInputFrame, inputFrameSize, nOutputFrame);
+ }
+
+ THNN_CHECK_DIM_SIZE(input, ndim, dimF, inputFrameSize);
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimF, inputFrameSize);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimS, nOutputFrame);
+ }
+}
+
+static void THNN_(unfolded_acc_row)(
+ THTensor *finput,
+ THTensor *input,
+ int kW,
+ int dW,
+ int padW,
+ long inputFrameSize,
+ long nInputFrame,
+ long nOutputFrame) {
+
+ size_t c;
+ real *input_data = THTensor_(data)(input);
+ real *finput_data = THTensor_(data)(finput);
+
+// #pragma omp parallel for private(c)
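+ /* reverse of unfolded_copy_row: scatter-add each of the kW unfolded rows
+ back onto its (possibly overlapping) positions in the input gradient */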
+ for (c = 0; c < inputFrameSize; c++) {
+ size_t kw, x;
+ long long ix = 0;
+
+ for (kw = 0; kw < kW; kw++) {
+ real *src = finput_data
+ + c * (kW * nOutputFrame)
+ + kw * (nOutputFrame);
+ real *dst = input_data + c * (nInputFrame);
+
+ ix = (long long)(kw);
+ if (dW == 1) {
+ real *dst_slice = dst + (size_t)(ix);
+ THVector_(cadd)(dst_slice, dst_slice, src, 1, nOutputFrame);
+ } else {
+ for (x = 0; x < nOutputFrame; x++) {
+ real *dst_slice = dst + (size_t)(ix + x * dW);
+ THVector_(cadd)(dst_slice, dst_slice,
+ src + (size_t)(x), 1, 1);
+ }
+ }
+ }
+ }
+}
+
+static void THNN_(unfolded_copy_row)(
+ THTensor *finput,
+ THTensor *input,
+ int kW,
+ int dW,
+ int padW,
+ long inputFrameSize,
+ long nInputFrame,
+ long nOutputFrame) {
+
+ long k;
+ real *input_data = THTensor_(data)(input);
+ real *finput_data = THTensor_(data)(finput);
+
+// #pragma omp parallel for private(k)
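+ /* 1D im2col: copy every width-kW sliding window of each feature row into
+ finput so the row convolution reduces to a matrix multiply */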
+ for (k = 0; k < inputFrameSize * kW; k++) {
+ size_t c = k / kW;
+ size_t rest = k % kW;
+ size_t kw = rest % kW;
+ size_t x;
+ long long ix;
+ real *dst = finput_data + c * (kW * nOutputFrame) + kw * (nOutputFrame);
+ real *src = input_data + c * (nInputFrame);
+
+ ix = (long long)(kw);
+ if (dW == 1) {
+ memcpy(dst, src+(size_t)(ix), sizeof(real) * (nOutputFrame));
+ } else {
+ for (x = 0; x < nOutputFrame; x++) {
+ memcpy(dst + (size_t)(x), src + (size_t)(ix + x * dW),
+ sizeof(real) * 1);
+ }
+ }
+ }
+}
+
+static void THNN_(TemporalRowConvolution_updateOutput_frame)(
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *finput,
+ int kW,
+ int dW,
+ int padW,
+ long inputFrameSize,
+ long nInputFrame,
+ long nOutputFrame) {
+
+ long i;
+
+ THTensor *output3d = THTensor_(newWithStorage3d)(
+ output->storage, output->storageOffset,
+ inputFrameSize, -1,
+ 1, -1,
+ nOutputFrame, -1);
+
+ THNN_(unfolded_copy_row)(finput, input, kW, dW, padW,
+ inputFrameSize, nInputFrame, nOutputFrame);
+
+ THTensor_(zero)(output);
+
+ if (bias != NULL) {
+ for (i = 0; i < inputFrameSize; i++)
+ THVector_(fill)
+ (output->storage->data + output->storageOffset
+ + output->stride[0] * i,
+ THTensor_(get1d)(bias, i), nOutputFrame);
+ }
+
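+ /* batched matrix multiply, one GEMM per feature row:
+ weight: inputFrameSize x 1 x kW
+ finput: inputFrameSize x kW x nOutputFrame
+ output3d: inputFrameSize x 1 x nOutputFrame */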
+ THTensor_(baddbmm)(output3d, 1, output3d, 1, weight, finput);
+
+ THTensor_(free)(output3d);
+}
+
+void THNN_(TemporalRowConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *finput,
+ THTensor *fgradInput, // unused here but needed for CUDA
+ int kW,
+ int dW,
+ int padW,
+ bool featFirst) {
+
+ int ndim = input->nDimension;
+
+ THTensor *tinput;
+ if (!featFirst) {
+ tinput = THTensor_(newTranspose)(input, ndim - 1, ndim - 2);
+ input = THTensor_(newContiguous)(tinput);
+ } else {
+ input = THTensor_(newContiguous)(input);
+ }
+
+ THNN_(TemporalRowConvolution_shapeCheck)(
+ state, input, NULL, weight, bias, kW, dW, padW);
+
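+ /* row convolution is depthwise: one width-kW filter per input feature row */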
+ long inputFrameSize = weight->size[0];
+ long nInputFrame = input->size[ndim - 1];
+ long nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1;
+
+ if (ndim == 2) { /* non-batch mode */
+
+ THTensor_(resize3d)(finput, inputFrameSize, kW, nOutputFrame);
+ THTensor_(resize2d)(output, inputFrameSize, nOutputFrame);
+
+ THTensor_(zero)(finput);
+ THTensor_(zero)(output);
+
+ THNN_(TemporalRowConvolution_updateOutput_frame)
+ (input, output, weight, bias, finput,
+ kW, dW, padW,
+ inputFrameSize, nInputFrame, nOutputFrame);
+
+ } else {
+ long T = input->size[0];
+ long t;
+
+ THTensor_(resize4d)(finput, T, inputFrameSize, kW, nOutputFrame);
+ THTensor_(resize3d)(output, T, inputFrameSize, nOutputFrame);
+
+ THTensor_(zero)(finput);
+ THTensor_(zero)(output);
+
+#pragma omp parallel for private(t)
+ for (t = 0; t < T; t++) {
+ THTensor *input_t = THTensor_(newSelect)(input, 0, t);
+ THTensor *output_t = THTensor_(newSelect)(output, 0, t);
+ THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
+
+ THNN_(TemporalRowConvolution_updateOutput_frame)
+ (input_t, output_t, weight, bias, finput_t,
+ kW, dW, padW, inputFrameSize, nInputFrame, nOutputFrame);
+
+ THTensor_(free)(input_t);
+ THTensor_(free)(output_t);
+ THTensor_(free)(finput_t);
+ }
+ }
+
+ if (!featFirst) { // NOTE: output will NOT be contiguous in this case
+ THTensor_(transpose)(output, output, ndim - 1, ndim - 2);
+ THTensor_(free)(tinput);
+ }
+
+ THTensor_(free)(input);
+}
+
+static void THNN_(TemporalRowConvolution_updateGradInput_frame)(
+ THTensor *gradInput,
+ THTensor *gradOutput,
+ THTensor *weight,
+ THTensor *fgradInput,
+ int kW,
+ int dW,
+ int padW,
+ long inputFrameSize,
+ long nInputFrame,
+ long nOutputFrame) {
+
+ THTensor *gradOutput3d = THTensor_(newWithStorage3d)(
+ gradOutput->storage, gradOutput->storageOffset,
+ inputFrameSize, -1,
+ 1, -1,
+ nOutputFrame, -1);
+
+ // weight: inputFrameSize x kW x 1
+ // gradOutput3d: inputFrameSize x 1 x nOutputFrame
+ THTensor_(baddbmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput3d);
+ // fgradInput: inputFrameSize x kW x nOutputFrame
+ THTensor_(free)(gradOutput3d);
+
+ THTensor_(zero)(gradInput);
+
+ THNN_(unfolded_acc_row)(fgradInput, gradInput,
+ kW, dW, padW,
+ inputFrameSize, nInputFrame, nOutputFrame);
+}
+
+void THNN_(TemporalRowConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW,
+ int dW,
+ int padW,
+ bool featFirst) {
+
+ int ndim = input->nDimension;
+
+ THTensor *tinput, *tgradOutput;
+
+ if (!featFirst) {
+ tinput = THTensor_(newTranspose)(input, ndim - 1, ndim - 2);
+ tgradOutput = THTensor_(newTranspose)(gradOutput, ndim - 1, ndim - 2);
+
+ input = THTensor_(newContiguous)(tinput);
+ gradOutput = THTensor_(newContiguous)(tgradOutput);
+
+ } else {
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ }
+
+ THNN_(TemporalRowConvolution_shapeCheck)(state, input, gradOutput, weight,
+ NULL, kW, dW, padW);
+
+ long inputFrameSize = weight->size[0];
+ long nInputFrame = input->size[ndim - 1];
+ long nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1;
+
+ THTensor_(resizeAs)(fgradInput, finput);
+ THTensor_(resizeAs)(gradInput, input);
+
+ THTensor_(zero)(fgradInput);
+ THTensor_(zero)(gradInput);
+
+ THTensor *tweight = THTensor_(new)();
+ THTensor_(transpose)(tweight, weight, 1, 2);
+
+ if (ndim == 2) {
+ THNN_(TemporalRowConvolution_updateGradInput_frame)
+ (gradInput, gradOutput, tweight, fgradInput,
+ kW, dW, padW,
+ inputFrameSize, nInputFrame, nOutputFrame);
+ } else {
+ long T = input->size[0];
+ long t;
+
+#pragma omp parallel for private(t)
+ for (t = 0; t < T; t++) {
+
+ THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t);
+ THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
+ THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t);
+
+ THNN_(TemporalRowConvolution_updateGradInput_frame)
+ (gradInput_t, gradOutput_t, tweight, fgradInput_t,
+ kW, dW, padW,
+ inputFrameSize, nInputFrame, nOutputFrame);
+
+ THTensor_(free)(gradInput_t);
+ THTensor_(free)(gradOutput_t);
+ THTensor_(free)(fgradInput_t);
+ }
+ }
+
+ THTensor_(free)(tweight);
+
+ if (!featFirst) { // NOTE: gradInput will NOT be contiguous in this case
+
+ THTensor_(free)(tinput);
+ THTensor_(free)(tgradOutput);
+
+ THTensor_(transpose)(gradInput, gradInput, ndim - 1, ndim - 2);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+
+}
+
+static void THNN_(TemporalRowConvolution_accGradParameters_frame)(
+ THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias,
+ THTensor *finput, real scale) {
+
+ long i;
+ THTensor *gradOutput3d = THTensor_(newWithStorage3d)(
+ gradOutput->storage, gradOutput->storageOffset,
+ gradOutput->size[0], -1,
+ 1, -1,
+ gradOutput->size[1], -1);
+
+ THTensor *tfinput = THTensor_(new)();
+ THTensor_(transpose)(tfinput, finput, 1, 2);
+ // gradOutput3d: inputFrameSize x 1 x nOutputFrame
+ // tfinput: inputFrameSize x nOutputFrame x kW
+ THTensor_(baddbmm)(gradWeight, 1, gradWeight, scale, gradOutput3d, tfinput);
+ // gradWeight: inputFrameSize x 1 x kW
+ THTensor_(free)(tfinput);
+
+ if (gradBias != NULL) {
+ for (i = 0; i < gradBias->size[0]; i++) {
+ long k;
+ real sum = 0;
+ real *data = gradOutput3d->storage->data
+ + gradOutput3d->storageOffset
+ + i * gradOutput3d->stride[0];
+ for (k = 0; k < gradOutput3d->size[2]; k++) {
+ sum += data[k];
+ }
+ (gradBias->storage->data + gradBias->storageOffset)[i]
+ += scale * sum;
+ }
+ }
+
+ THTensor_(free)(gradOutput3d);
+
+}
+
+void THNN_(TemporalRowConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW,
+ int dW,
+ int padW,
+ bool featFirst,
+ accreal scale_) {
+
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ int ndim = input->nDimension;
+
+ THTensor *tinput, *tgradOutput;
+
+ if (!featFirst) {
+ tinput = THTensor_(newTranspose)(input, ndim - 1, ndim - 2);
+ tgradOutput = THTensor_(newTranspose)(gradOutput, ndim - 1, ndim - 2);
+
+ input = THTensor_(newContiguous)(tinput);
+ gradOutput = THTensor_(newContiguous)(tgradOutput);
+ } else {
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ }
+
+ THNN_(TemporalRowConvolution_shapeCheck)
+ (state, input, gradOutput, gradWeight, gradBias, kW, dW, padW);
+
+ long inputFrameSize = gradWeight->size[0];
+ long nInputFrame = input->size[ndim - 1];
+ long nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1;
+
+ if (ndim == 2) {
+ THNN_(TemporalRowConvolution_accGradParameters_frame)(
+ gradOutput, gradWeight, gradBias, finput, scale);
+ } else {
+ long T = input->size[0];
+ long t;
+
+ for (t = 0; t < T; t++) {
+ THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
+ THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
+
+ THNN_(TemporalRowConvolution_accGradParameters_frame)(
+ gradOutput_t, gradWeight, gradBias, finput_t, scale);
+
+ THTensor_(free)(gradOutput_t);
+ THTensor_(free)(finput_t);
+ }
+ }
+
+ if (!featFirst) {
+ THTensor_(free)(tinput);
+ THTensor_(free)(tgradOutput);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/TemporalSubSampling.c b/contrib/lua-torch/nn/lib/THNN/generic/TemporalSubSampling.c
new file mode 100644
index 000000000..68f35e28a
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/TemporalSubSampling.c
@@ -0,0 +1,156 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/TemporalSubSampling.c"
+#else
+
+static inline void THNN_(TemporalSubSampling_shapeCheck)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ int kW,
+ int dW,
+ int *inputFrameSize) {
+ int nInputFrame, nOutputFrame;
+
+ THArgCheck(kW > 0, 6,
+ "kernel size should be greater than zero, but got kW: %d", kW);
+ THArgCheck(dW > 0, 7,
+ "stride should be greater than zero, but got dW: %d", dW);
+
+ THNN_ARGCHECK(input->nDimension == 2, 2, input,
+ "2D tensor expected for input, but got: %s");
+ if (inputFrameSize != NULL) {
+ THArgCheck( input->size[1] == *inputFrameSize, 2,
+ "invalid input frame size. Got: %d, Expected: %d",
+ input->size[1], *inputFrameSize);
+ }
+ THArgCheck( input->size[0] >= kW, 2,
+ "input sequence smaller than kernel size. Got %d, Expected: %d",
+ input->size[0], kW);
+
+ nInputFrame = input->size[0];
+ nOutputFrame = (nInputFrame - kW) / dW + 1;
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, input->nDimension, 0, nOutputFrame);
+ if (inputFrameSize != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, input->nDimension, 1, *inputFrameSize);
+ }
+ }
+}
+
+void THNN_(TemporalSubSampling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ int kW,
+ int dW,
+ int inputFrameSize)
+{
+ THTensor *outputFrame, *inputWindow;
+ int nInputFrame, nOutputFrame;
+ long k;
+
+ THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous");
+ THArgCheck(!bias || THTensor_(isContiguous)(bias), 5, "bias must be contiguous");
+ THNN_(TemporalSubSampling_shapeCheck)(state, input, NULL, kW, dW, &inputFrameSize);
+
+ outputFrame = THTensor_(new)();
+ inputWindow = THTensor_(new)();
+
+ nInputFrame = input->size[0];
+ nOutputFrame = (nInputFrame - kW) / dW + 1;
+
+ THTensor_(resize2d)(output,
+ nOutputFrame,
+ inputFrameSize);
+
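+ /* each output frame is the per-feature sum over a kW-frame input window,
+ scaled elementwise by weight and shifted by bias */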
+ for(k = 0; k < nOutputFrame; k++)
+ {
+ THTensor_(narrow)(inputWindow, input, 0, k*dW, kW);
+ THTensor_(select)(outputFrame, output, 0, k);
+ THTensor_(sum)(outputFrame, inputWindow, 0, 1);
+ THTensor_(cmul)(outputFrame, outputFrame, weight);
+ THTensor_(cadd)(outputFrame, outputFrame, 1, bias);
+ }
+
+ THTensor_(free)(outputFrame);
+ THTensor_(free)(inputWindow);
+}
+
+void THNN_(TemporalSubSampling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ int kW,
+ int dW)
+{
+
+ THTensor *gradOutputFrame;
+ THTensor *gradInputWindow, *buffer, *kwunit;
+ long k;
+
+ THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous");
+ THNN_(TemporalSubSampling_shapeCheck)(state, input, gradOutput, kW, dW, NULL);
+
+ gradOutputFrame = THTensor_(new)();
+ gradInputWindow = THTensor_(new)();
+ buffer = THTensor_(new)();
+ kwunit = THTensor_(newWithSize1d)(kW);
+
+ THTensor_(fill)(kwunit, 1);
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
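+ /* the outer product with a length-kW ones vector broadcasts
+ weight * gradOutput[k] onto every frame of the input window */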
+ for(k = 0; k < gradOutput->size[0]; k++)
+ {
+ THTensor_(narrow)(gradInputWindow, gradInput, 0, k*dW, kW);
+ THTensor_(select)(gradOutputFrame, gradOutput, 0, k);
+ THTensor_(cmul)(buffer, weight, gradOutputFrame);
+ THTensor_(addr)(gradInputWindow, 1, gradInputWindow, 1, kwunit, buffer);
+ }
+
+ THTensor_(free)(gradOutputFrame);
+ THTensor_(free)(gradInputWindow);
+ THTensor_(free)(buffer);
+ THTensor_(free)(kwunit);
+}
+
+void THNN_(TemporalSubSampling_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ int kW,
+ int dW,
+ accreal scale_)
+{
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ THTensor *gradOutputFrame;
+ THTensor *inputWindow, *buffer;
+ long k;
+
+ THNN_(TemporalSubSampling_shapeCheck)(state, input, gradOutput, kW, dW, NULL);
+ gradOutputFrame = THTensor_(new)();
+ inputWindow = THTensor_(new)();
+ buffer = THTensor_(new)();
+
+ for(k = 0; k < gradOutput->size[0]; k++)
+ {
+ THTensor_(narrow)(inputWindow, input, 0, k*dW, kW);
+ THTensor_(select)(gradOutputFrame, gradOutput, 0, k);
+ THTensor_(sum)(buffer, inputWindow, 0, 1);
+ THTensor_(addcmul)(gradWeight, gradWeight, scale, buffer, gradOutputFrame);
+ THTensor_(cadd)(gradBias, gradBias, scale, gradOutputFrame);
+ }
+
+ THTensor_(free)(gradOutputFrame);
+ THTensor_(free)(inputWindow);
+ THTensor_(free)(buffer);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/Threshold.c b/contrib/lua-torch/nn/lib/THNN/generic/Threshold.c
new file mode 100644
index 000000000..949c7a07c
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/Threshold.c
@@ -0,0 +1,64 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/Threshold.c"
+#else
+
+void THNN_(Threshold_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ accreal threshold_,
+ accreal val_,
+ bool inplace)
+{
+ real threshold = TH_CONVERT_ACCREAL_TO_REAL(threshold_);
+ real val = TH_CONVERT_ACCREAL_TO_REAL(val_);
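+ /* y = x when x > threshold, otherwise y = val */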
+ if (inplace)
+ {
+ TH_TENSOR_APPLY(real, input,
+ if (*input_data <= threshold)
+ *input_data = val;
+ );
+ THTensor_(set)(output, input);
+ }
+ else
+ {
+ THTensor_(resizeAs)(output, input);
+ TH_TENSOR_APPLY2(real, output, real, input,
+ *output_data = (*input_data > threshold) ? *input_data : val;
+ );
+ }
+}
+
+void THNN_(Threshold_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ accreal threshold_,
+ accreal val_,
+ bool inplace)
+{
+ real threshold = TH_CONVERT_ACCREAL_TO_REAL(threshold_);
+ real val = TH_CONVERT_ACCREAL_TO_REAL(val_);
+ THNN_CHECK_NELEMENT(input, gradOutput);
+ if (inplace)
+ {
+ TH_TENSOR_APPLY2(real, gradOutput, real, input,
+ if ((*input_data) <= threshold)
+ *gradOutput_data = 0;
+ );
+ THTensor_(set)(gradInput, gradOutput);
+ }
+ else
+ {
+ THTensor_(resizeAs)(gradInput, input);
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
+ if ((*input_data) > threshold)
+ *gradInput_data = *gradOutput_data;
+ else
+ *gradInput_data = 0;
+ );
+ }
+}
+
+#endif
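
The two Threshold functions above reduce to a single elementwise rule; the in-place branches only avoid allocating a second tensor. A hedged plain-array restatement (illustrative, not the THNN API):

    #include <stdio.h>

    /* y[i] = x[i] if x[i] > threshold, else val  (Threshold_updateOutput) */
    static void threshold_forward(const float *x, float *y, long n,
                                  float threshold, float val)
    {
      for (long i = 0; i < n; i++)
        y[i] = (x[i] > threshold) ? x[i] : val;
    }

    /* gradient passes through only where the input exceeded the threshold */
    static void threshold_backward(const float *x, const float *gy,
                                   float *gx, long n, float threshold)
    {
      for (long i = 0; i < n; i++)
        gx[i] = (x[i] > threshold) ? gy[i] : 0.f;
    }

    int main(void)
    {
      const float x[3] = {-1.f, 0.5f, 2.f}, gy[3] = {1.f, 1.f, 1.f};
      float y[3], gx[3];
      threshold_forward(x, y, 3, 0.f, 0.f);   /* threshold=0, val=0: ReLU */
      threshold_backward(x, gy, gx, 3, 0.f);
      printf("%g %g %g / %g %g %g\n", y[0], y[1], y[2], gx[0], gx[1], gx[2]);
      return 0;
    }

With threshold = 0 and val = 0 this is exactly ReLU, which is how nn.ReLU builds on this kernel.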
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricAveragePooling.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricAveragePooling.c
new file mode 100644
index 000000000..91c870e6f
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricAveragePooling.c
@@ -0,0 +1,373 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricAveragePooling.c"
+#else
+
+static inline void THNN_(VolumetricAveragePooling_shapeCheck)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH) {
+ long nslices;
+ long itime;
+ long iheight;
+ long iwidth;
+ long otime;
+ long oheight;
+ long owidth;
+ int ndim = input->nDimension;
+ int dimN = 0;
+ int dimt = 1;
+ int dimh = 2;
+ int dimw = 3;
+
+ if (input->nDimension == 5)
+ {
+ dimN++;
+ dimt++;
+ dimh++;
+ dimw++;
+ }
+
+ THArgCheck(kT > 0 && kW > 0 && kH > 0, 5,
+ "kernel size should be greater than zero, but got kT: %d kH: %d kW: %d",
+ kT, kH, kW);
+ THArgCheck(dT > 0 && dW > 0 && dH > 0, 8,
+ "stride should be greater than zero, but got dT: %d dH: %d dW: %d",
+ dT, dH, dW);
+ THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+ "4D or 5D (batch mode) tensor expected for input, but got: %s");
+
+ THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH
+ && input->size[dimt] >= kT, 2,
+ "input image (T: %d H: %d W: %d) smaller than "
+ "kernel size (kT: %d kH: %d kW: %d)",
+ input->size[dimt], input->size[dimh], input->size[dimw],
+ kT, kH, kW);
+
+ /* sizes */
+ nslices = input->size[dimN];
+ itime = input->size[dimt];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ otime = (itime - kT) / dT + 1;
+ oheight = (iheight - kH) / dH + 1;
+ owidth = (iwidth - kW) / dW + 1;
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimN, nslices);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimt, otime);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, oheight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, owidth);
+ }
+}
+
+static void THNN_(VolumetricAveragePooling_updateOutput_frame)(
+ real *input_p,
+ real *output_p,
+ long nslices,
+ long itime,
+ long iwidth,
+ long iheight,
+ long otime,
+ long owidth,
+ long oheight,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH)
+{
+ long k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nslices; k++)
+ {
+ /* loop over output */
+ long i, j, ti;
+ for (ti = 0; ti < otime; ti++)
+ {
+ for (i = 0; i < oheight; i++)
+ {
+ for (j = 0; j < owidth; j++)
+ {
+ /* local pointers */
+ real *ip = input_p + k * itime * iwidth * iheight
+ + ti * iwidth * iheight * dT + i * iwidth * dH + j * dW;
+ real *op = output_p + k * otime * owidth * oheight
+ + ti * owidth * oheight + i * owidth + j;
+
+ /* compute local sum: */
+ real sum = 0.0;
+ int x, y, z;
+
+ for (z=0; z < kT; z++)
+ {
+ for (y = 0; y < kH; y++)
+ {
+ for (x = 0; x < kW; x++)
+ {
+ sum += *(ip + z * iwidth * iheight + y * iwidth + x);
+ }
+ }
+ }
+
+ /* set output to the local average */
+ *op = sum / (kT * kW * kH);
+ }
+ }
+ }
+ }
+}
+
+void THNN_(VolumetricAveragePooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH)
+{
+ long nslices;
+ long itime;
+ long iheight;
+ long iwidth;
+ long otime;
+ long oheight;
+ long owidth;
+ real *input_data;
+ real *output_data;
+
+ THNN_(VolumetricAveragePooling_shapeCheck)(
+ state, input, NULL, kT, kW, kH,
+ dT, dW, dH);
+
+ int dimN = 0;
+ int dimt = 1;
+ int dimh = 2;
+ int dimw = 3;
+
+ if (input->nDimension == 5)
+ {
+ dimN++;
+ dimt++;
+ dimh++;
+ dimw++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimN];
+ itime = input->size[dimt];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ otime = (itime - kT) / dT + 1;
+ oheight = (iheight - kH) / dH + 1;
+ owidth = (iwidth - kW) / dW + 1;
+
+ /* get contiguous input */
+ input = THTensor_(newContiguous)(input);
+
+ if (input->nDimension == 4) /* non-batch mode */
+ {
+ /* resize output */
+ THTensor_(resize4d)(output, nslices, otime, oheight, owidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+
+ THNN_(VolumetricAveragePooling_updateOutput_frame)(
+ input_data, output_data, nslices,
+ itime, iwidth, iheight,
+ otime, owidth, oheight,
+ kT, kW, kH,
+ dT, dW, dH
+ );
+ }
+ else /* batch mode */
+ {
+ long p;
+ long nBatch = input->size[0];
+
+ long istride = nslices * itime * iwidth * iheight;
+ long ostride = nslices * otime * owidth * oheight;
+
+ /* resize output */
+ THTensor_(resize5d)(output, nBatch, nslices, otime, oheight, owidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+
+#pragma omp parallel for private(p)
+ for (p=0; p < nBatch; p++)
+ {
+ THNN_(VolumetricAveragePooling_updateOutput_frame)(
+ input_data + p * istride, output_data + p * ostride, nslices,
+ itime, iwidth, iheight,
+ otime, owidth, oheight,
+ kT, kW, kH,
+ dT, dW, dH
+ );
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(input);
+}
+
+static void THNN_(VolumetricAveragePooling_updateGradInput_frame)(
+ real *gradInput_p,
+ real *gradOutput_p,
+ long nslices,
+ long itime,
+ long iwidth,
+ long iheight,
+ long otime,
+ long owidth,
+ long oheight,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH)
+{
+ long k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nslices; k++)
+ {
+ /* loop over output */
+ long i, j, ti;
+ for (ti = 0; ti < otime; ti++)
+ {
+ for (i = 0; i < oheight; i++)
+ {
+ for (j = 0; j < owidth; j++)
+ {
+ /* local pointers */
+ real *ip = gradInput_p + k * itime * iwidth * iheight
+ + ti * iwidth * iheight * dT + i * iwidth * dH + j * dW;
+ real *op = gradOutput_p + k * otime * owidth * oheight
+ + ti * owidth * oheight + i * owidth + j;
+
+ /* scatter gradients out to footprint: */
+ real val = *op / (kT * kW * kH);
+ int x,y,z;
+ for (z=0; z < kT; z++)
+ {
+ for (y = 0; y < kH; y++)
+ {
+ for (x = 0; x < kW; x++)
+ {
+ *(ip + z * iwidth * iheight + y * iwidth + x) += val;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+void THNN_(VolumetricAveragePooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH)
+{
+ int nslices;
+ int itime;
+ int iheight;
+ int iwidth;
+ int otime;
+ int oheight;
+ int owidth;
+ real *gradInput_data;
+ real *gradOutput_data;
+
+ int dimN = 0;
+ int dimt = 1;
+ int dimh = 2;
+ int dimw = 3;
+
+ THNN_(VolumetricAveragePooling_shapeCheck)(
+ state, input, gradOutput, kT, kW, kH,
+ dT, dW, dH);
+
+ /* get contiguous gradOutput */
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ /* resize */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ if (input->nDimension == 5)
+ {
+ dimN++;
+ dimt++;
+ dimh++;
+ dimw++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimN];
+ itime = input->size[dimt];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ otime = gradOutput->size[dimt];
+ oheight = gradOutput->size[dimh];
+ owidth = gradOutput->size[dimw];
+
+ /* get raw pointers */
+ gradInput_data = THTensor_(data)(gradInput);
+ gradOutput_data = THTensor_(data)(gradOutput);
+
+ /* backprop */
+ if (input->nDimension == 4) /* non-batch mode */
+ {
+ THNN_(VolumetricAveragePooling_updateGradInput_frame)(
+ gradInput_data, gradOutput_data, nslices,
+ itime, iwidth, iheight,
+ otime, owidth, oheight,
+ kT, kW, kH,
+ dT, dW, dH
+ );
+ }
+ else /* batch mode */
+ {
+ long p;
+ long nBatch = input->size[0];
+
+ long istride = nslices * itime * iwidth * iheight;
+ long ostride = nslices * otime * owidth * oheight;
+
+#pragma omp parallel for private(p)
+ for (p = 0; p < nBatch; p++)
+ {
+ THNN_(VolumetricAveragePooling_updateGradInput_frame)(
+ gradInput_data + p * istride, gradOutput_data + p * ostride, nslices,
+ itime, iwidth, iheight,
+ otime, owidth, oheight,
+ kT, kW, kH,
+ dT, dW, dH
+ );
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(gradOutput);
+}
+
+#endif
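
Both directions above share the same valid (unpadded) output geometry, and the backward pass simply spreads each gradient value uniformly over its kT*kW*kH footprint. A small sketch of the size arithmetic used by updateOutput and updateGradInput (illustrative):

    #include <stdio.h>

    /* output extent of valid 3-D average pooling: (in - k) / d + 1 */
    static long pooled_size(long in, int k, int d)
    {
      return (in - k) / d + 1;
    }

    int main(void)
    {
      printf("%ld\n", pooled_size(7, 3, 2));   /* (7-3)/2+1 = 3 */
      printf("%ld\n", pooled_size(8, 2, 2));   /* (8-2)/2+1 = 4 */
      return 0;
    }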
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricConvolution.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricConvolution.c
new file mode 100644
index 000000000..be1aa82e6
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricConvolution.c
@@ -0,0 +1,260 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricConvolution.c"
+#else
+
+void THNN_(VolumetricConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *finput, // only used by cuda impl
+ THTensor *fgradInput, // only used by cuda impl
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH)
+{
+ THArgCheck(pT == 0 && pW == 0 && pH == 0, 9, "padding not supported by CPU backend"); // sharing signature with CUDA version
+
+ THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+ "4D or 5D (batch mode) tensor expected for input, but got: %s");
+
+ int dimt = 1;
+ int dimh = 2;
+ int dimw = 3;
+
+ if (input->nDimension == 5)
+ {
+ dimt++;
+ dimh++;
+ dimw++;
+ }
+
+ long nOutputPlane = weight->size[0];
+ long kT = weight->size[2];
+ long kH = weight->size[3];
+ long kW = weight->size[4];
+ long inputDepth = input->size[dimt];
+ long inputHeight = input->size[dimh];
+ long inputWidth = input->size[dimw];
+ long outputDepth = (inputDepth - kT) / dT + 1;
+ long outputWidth = (inputWidth - kW) / dW + 1;
+ long outputHeight = (inputHeight - kH) / dH + 1;
+ THTensor *outn = THTensor_(new)();
+ long i, j;
+ if (input->nDimension == 4) /* non-batch mode */
+ {
+ THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth);
+
+ /* add bias */
+ if (bias) {
+ for (i = 0; i < bias->size[0]; i++)
+ {
+ THTensor_(select)(outn, output, 0, i);
+ THTensor_(fill)(outn, THTensor_(get1d)(bias, i));
+ }
+ } else {
+ THTensor_(zero)(output);
+ }
+
+ /* do convolutions */
+ THTensor_(conv3Dmv)(output, 1.0, 1.0, input, weight, dT, dH, dW, "V", "X");
+ }
+ else /* batch mode */
+ {
+ long nBatch = input->size[0];
+ THTensor_(resize5d)(output, nBatch, nOutputPlane, outputDepth, outputHeight, outputWidth);
+ THTensor *inb = THTensor_(new)();
+ THTensor *outb = THTensor_(new)();
+
+ /* loop over batches */
+ for (j = 0; j < nBatch; j++)
+ {
+ THTensor_(select)(inb, input, 0, j);
+ THTensor_(select)(outb, output, 0, j);
+
+ /* add bias */
+ if (bias) {
+ for (i = 0; i < bias->size[0]; i++)
+ {
+ THTensor_(select)(outn, outb, 0, i);
+ THTensor_(fill)(outn, THTensor_(get1d)(bias, i));
+ }
+ } else {
+ THTensor_(zero)(outb);
+ }
+
+ /* do convolutions */
+ THTensor_(conv3Dmv)(outb, 1.0, 1.0, inb, weight, dT, dH, dW, "V", "X");
+ }
+
+ THTensor_(free)(inb);
+ THTensor_(free)(outb);
+ }
+ THTensor_(free)(outn);
+}
+
+void THNN_(VolumetricConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *finput, // only used by cuda impl
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH)
+{
+ THArgCheck(pT == 0 && pW == 0 && pH == 0, 9, "padding not supported by CPU backend"); // sharing signature with CUDA version
+
+ THNN_ARGCHECK(weight->nDimension == 5, 4, weight,
+ "5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor "
+ "expected for weight, but got: %s");
+
+ int nOutputPlane = (int)weight->size[0];
+
+ THNN_ARGCHECK(gradOutput->nDimension == 4 || gradOutput->nDimension == 5, 3,
+ gradOutput,
+ "4D or 5D (batch mode) tensor expected for gradOutput, but got: %s");
+
+ int dimPlane = 0;
+ if (gradOutput->nDimension == 5)
+ {
+ dimPlane++;
+ }
+
+ THArgCheck(nOutputPlane == gradOutput->size[dimPlane], 1,
+ "Number of output features is not equal to nOutputPlane"
+ );
+
+ /* gradient to input */
+ THTensor *tweight = THTensor_(newTranspose)(weight, 0, 1);
+ if (gradOutput->nDimension == 4) /* non-batch mode */
+ {
+ THTensor_(conv3Dmv)(gradInput, 0.0, 1.0, gradOutput, tweight, dT, dH, dW, "F", "C");
+ }
+ else /* batch mode */
+ {
+ long nBatch = gradOutput->size[0];
+ THTensor *ginpb = THTensor_(new)();
+ THTensor *goutb = THTensor_(new)();
+ long j;
+
+ THTensor_(resize5d)(gradInput,
+ input->size[0], input->size[1], input->size[2], input->size[3], input->size[4]
+ );
+
+ /* loop over batches */
+ for (j = 0; j < nBatch; j++)
+ {
+ THTensor_(select)(ginpb, gradInput, 0, j);
+ THTensor_(select)(goutb, gradOutput, 0, j);
+ THTensor_(conv3Dmv)(ginpb, 0.0, 1.0, goutb, tweight, dT, dH, dW, "F", "C");
+ }
+ THTensor_(free)(ginpb);
+ THTensor_(free)(goutb);
+ }
+
+ THTensor_(free)(tweight);
+}
+
+void THNN_(VolumetricConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *finput, // only used by cuda impl
+ THTensor *fgradInput, // only used by cuda impl
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH,
+ accreal scale_)
+{
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ THArgCheck(pT == 0 && pW == 0 && pH == 0, 9, "padding not supported by CPU backend"); // sharing signature with CUDA version
+
+ THNN_ARGCHECK(gradWeight->nDimension == 5, 4, gradWeight,
+ "5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor "
+ "expected for gradWeight, but got: %s");
+
+ int nOutputPlane = (int)gradWeight->size[0];
+ if (gradBias) {
+ THArgCheck(gradBias->nDimension == 1 && gradBias->size[0] == nOutputPlane, 5,
+ "gradBias tensor has wrong size"
+ );
+ }
+
+ long k;
+ real *gradBias_data;
+ THTensor *gradOutSlice;
+ int dimPlane = 0;
+ if (gradOutput->nDimension == 5)
+ {
+ dimPlane++;
+ }
+
+ THArgCheck(nOutputPlane == gradOutput->size[dimPlane], 1,
+ "Number of output features is not equal to nOutputPlane"
+ );
+
+ if (gradOutput->nDimension == 4) /* non-batch mode */
+ {
+ /* gradient to bias */
+ if (gradBias) {
+ gradBias_data = THTensor_(data)(gradBias);
+ gradOutSlice = THTensor_(new)();
+ for (k = 0; k < nOutputPlane; k++)
+ {
+ THTensor_(select)(gradOutSlice, gradOutput, 0, k);
+ gradBias_data[k] += scale * THTensor_(sumall)(gradOutSlice);
+ }
+ THTensor_(free)(gradOutSlice);
+ }
+
+ /* gradient to kernels */
+ THTensor_(conv3DRevger)(gradWeight, 1.0, scale, input, gradOutput, dT, dH, dW);
+ }
+ else /* batch mode */
+ {
+ long nBatch = gradOutput->size[0];
+ THTensor *inpb = THTensor_(new)();
+ THTensor *goutb = THTensor_(new)();
+ long j;
+
+ /* loop over batches */
+ for (j = 0; j < nBatch; j++)
+ {
+ THTensor_(select)(inpb, input, 0, j);
+ THTensor_(select)(goutb, gradOutput, 0, j);
+
+ /* gradient to bias */
+ if (gradBias) {
+ gradBias_data = THTensor_(data)(gradBias);
+ gradOutSlice = THTensor_(new)();
+ for (k = 0; k < nOutputPlane; k++)
+ {
+ THTensor_(select)(gradOutSlice, goutb, 0, k);
+ gradBias_data[k] += scale * THTensor_(sumall)(gradOutSlice);
+ }
+ THTensor_(free)(gradOutSlice);
+ }
+
+ /* gradient to kernels */
+ THTensor_(conv3DRevger)(gradWeight, 1.0, scale, inpb, goutb, dT, dH, dW);
+ }
+ THTensor_(free)(inpb);
+ THTensor_(free)(goutb);
+ }
+}
+
+#endif
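
The CPU path above delegates to THTensor_(conv3Dmv) in valid mode with cross-correlation ("V", "X"), which is why the padding arguments must be zero. A naive single-cell reference of what that computes for one input/output plane pair, as a sketch (plain arrays, illustrative names):

    #include <stdio.h>

    /* one output voxel of a valid-mode 3-D cross-correlation over a
       single input plane; the full op sums this over input planes and
       adds the bias that the code above fills in beforehand */
    static float conv3d_cell(const float *in, const float *w,
                             long iH, long iW,
                             int kT, int kH, int kW,
                             int dT, int dH, int dW,
                             long ot, long oi, long oj)
    {
      float sum = 0.f;
      for (int z = 0; z < kT; z++)
        for (int y = 0; y < kH; y++)
          for (int x = 0; x < kW; x++)
            sum += in[(ot*dT + z) * iH * iW + (oi*dH + y) * iW + (oj*dW + x)]
                 * w[z * kH * kW + y * kW + x];
      return sum;
    }

    int main(void)
    {
      /* 2x2x2 volume against a 2x2x2 kernel of ones -> 1+2+...+8 = 36 */
      const float in[8] = {1, 2, 3, 4, 5, 6, 7, 8};
      const float w[8]  = {1, 1, 1, 1, 1, 1, 1, 1};
      printf("%g\n", conv3d_cell(in, w, 2, 2, 2, 2, 2, 1, 1, 1, 0, 0, 0));
      return 0;
    }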
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricConvolutionMM.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricConvolutionMM.c
new file mode 100644
index 000000000..00a121db6
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricConvolutionMM.c
@@ -0,0 +1,628 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricConvolutionMM.c"
+#else
+
+static inline void THNN_(VolumetricConvolutionMM_shapeCheck)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *weight,
+ THTensor *bias,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH) {
+ THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+ "4D or 5D (batch mode) tensor expected for input, but got: %s");
+ THArgCheck(kT > 0 && kW > 0 && kH > 0, 8,
+ "kernel size should be greater than zero, but got kT: %d kH: %d kW: %d", kT, kH, kW);
+ THArgCheck(dT > 0 && dW > 0 && dH > 0, 11,
+ "stride should be greater than zero, but got dT: %d dH: %d dW: %d", dT, dH, dW);
+
+ int ndim = input->nDimension;
+ int dimf = 0;
+ int dimt = 1;
+ int dimh = 2;
+ int dimw = 3;
+
+ if (ndim == 5)
+ {
+ dimf++;
+ dimt++;
+ dimh++;
+ dimw++;
+ }
+
+ long nInputPlane;
+ long inputDepth;
+ long inputHeight;
+ long inputWidth;
+ long nOutputPlane;
+ long outputDepth;
+ long outputHeight;
+ long outputWidth;
+
+ nInputPlane = input->size[dimf];
+ inputDepth = input->size[dimt];
+ inputHeight = input->size[dimh];
+ inputWidth = input->size[dimw];
+ nOutputPlane = weight->size[0];
+ outputDepth = (inputDepth + 2*pT - kT) / dT + 1;
+ outputHeight = (inputHeight + 2*pH - kH) / dH + 1;
+ outputWidth = (inputWidth + 2*pW - kW) / dW + 1;
+
+ if (outputWidth < 1 || outputHeight < 1 || outputDepth < 1)
+ {
+ THError(
+ "Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small",
+ nInputPlane, inputDepth, inputHeight, inputWidth,
+ nOutputPlane, outputDepth, outputHeight, outputWidth
+ );
+ }
+
+ THArgCheck(weight->nDimension == 2 || weight->nDimension == 5, 4,
+ "weight tensor should be 2D or 5D - got %d", weight->nDimension);
+
+ if (bias != NULL) {
+ THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]);
+ }
+
+ THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane);
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimt, outputDepth);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
+ }
+}
+
+static int THNN_(view_weight)(THTensor **_weight)
+{
+ THTensor *weight = *_weight;
+ if (weight->nDimension == 5) {
+ long s1 = weight->size[0];
+ long s2 = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4];
+ *_weight = THTensor_(newWithStorage2d)(weight->storage, weight->storageOffset, s1, -1, s2, -1);
+ return 1;
+ }
+ return 0;
+}
+
+/* note: unlike unfolded_copy, adjacent kernel positions here accumulate into overlapping input locations, so this cannot be parallelized as effectively */
+static void THNN_(unfolded_acc_vol)(
+ THTensor *finput,
+ THTensor *input,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH,
+ int nInputPlane,
+ int inputDepth,
+ int inputWidth,
+ int inputHeight,
+ int outputDepth,
+ int outputWidth,
+ int outputHeight)
+{
+ int nip;
+ real *input_data = THTensor_(data)(input);
+ real *finput_data = THTensor_(data)(finput);
+
+//#pragma omp parallel for private(nip)
+ for (nip = 0; nip < nInputPlane; nip++)
+ {
+ int kt, kw, kh, t, y, x, it, ix, iy;
+ for (kt = 0; kt < kT; kt++)
+ {
+ for (kh = 0; kh < kH; kh++)
+ {
+ for (kw = 0; kw < kW; kw++)
+ {
+ real *src = finput_data
+ + nip * (kT*kH*kW*outputDepth*outputHeight*outputWidth)
+ + kt * (kH*kW*outputDepth*outputHeight*outputWidth)
+ + kh * (kW*outputDepth*outputHeight*outputWidth)
+ + kw * (outputDepth*outputHeight*outputWidth);
+
+ real *dst = input_data + nip*(inputDepth*inputHeight*inputWidth);
+ if (pT > 0 || pH > 0 || pW > 0)
+ {
+ for (t = 0; t < outputDepth; t++)
+ {
+ it = t*dT - pT + kt;
+ for (y = 0; y < outputHeight; y++)
+ {
+ iy = y*dH - pH + kh;
+ for (x = 0; x < outputWidth; x++)
+ {
+ ix = x*dW - pW + kw;
+ if (it >= 0 && it < inputDepth && iy >= 0 && iy < inputHeight && ix >= 0 && ix < inputWidth)
+ {
+ real *dst_slice = dst+it*inputHeight*inputWidth+iy*inputWidth+ix;
+ THVector_(cadd)(dst_slice, dst_slice, src+t*outputHeight*outputWidth+y*outputWidth+x, 1, 1);
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+ for (t = 0; t < outputDepth; t++)
+ {
+ it = t*dT + kt;
+ for (y = 0; y < outputHeight; y++)
+ {
+ iy = y*dH + kh;
+ for(x = 0; x < outputWidth; x++)
+ {
+ ix = x*dW + kw;
+ real *dst_slice = dst+it*inputHeight*inputWidth+iy*inputWidth+ix;
+ THVector_(cadd)(dst_slice, dst_slice, src+t*outputHeight*outputWidth+y*outputWidth+x, 1, 1);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+static void THNN_(unfolded_copy_vol)(
+ THTensor *finput,
+ THTensor *input,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH,
+ int nInputPlane,
+ int inputDepth,
+ int inputWidth,
+ int inputHeight,
+ int outputDepth,
+ int outputWidth,
+ int outputHeight)
+{
+ long k;
+ real *input_data = THTensor_(data)(input);
+ real *finput_data = THTensor_(data)(finput);
+// #pragma omp parallel for private(k)
+ for (k = 0; k < nInputPlane*kT*kH*kW; k++)
+ {
+ int nip = k / (kT*kH*kW);
+ int rest = k % (kT*kH*kW);
+ int kt = rest / (kH*kW);
+ rest = rest % (kH*kW);
+ int kh = rest / kW;
+ int kw = rest % kW;
+ int t,x,y,it,ix,iy;
+ real *dst = finput_data
+ + nip * (kT*kH*kW*outputDepth*outputHeight*outputWidth)
+ + kt * (kH*kW*outputDepth*outputHeight*outputWidth)
+ + kh * (kW*outputDepth*outputHeight*outputWidth)
+ + kw * (outputDepth*outputHeight*outputWidth);
+ real *src = input_data + nip*(inputDepth*inputHeight*inputWidth);
+
+ if (pT > 0 || pH > 0 || pW > 0)
+ {
+ for (t = 0; t < outputDepth; t++)
+ {
+ it = t*dT - pT + kt;
+ for (y = 0; y < outputHeight; y++)
+ {
+ iy = y*dH - pH + kh;
+ for (x = 0; x < outputWidth; x++)
+ {
+ ix = x*dW - pW + kw;
+ if (it < 0 || it >= inputDepth || iy < 0 || iy >= inputHeight || ix < 0 || ix >= inputWidth)
+ memset(dst+t*outputHeight*outputWidth+y*outputWidth+x, 0, sizeof(real)*(1));
+ else
+ memcpy(dst+t*outputHeight*outputWidth+y*outputWidth+x, src+it*inputHeight*inputWidth+iy*inputWidth+ix, sizeof(real)*(1));
+ }
+ }
+ }
+ }
+ else
+ {
+ for (t = 0; t < outputDepth; t++)
+ {
+ it = t*dT + kt;
+ for (y = 0; y < outputHeight; y++)
+ {
+ iy = y*dH + kh;
+ for(x = 0; x < outputWidth; x++)
+ {
+ ix = x*dW + kw;
+ memcpy(dst+t*outputHeight*outputWidth+y*outputWidth+x, src+it*inputHeight*inputWidth+iy*inputWidth+ix, sizeof(real)*(1));
+ }
+ }
+ }
+ }
+ }
+}
+
+static void THNN_(VolumetricConvolutionMM_updateOutput_frame)(
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *finput,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH,
+ long nInputPlane,
+ long inputDepth,
+ long inputWidth,
+ long inputHeight,
+ long nOutputPlane,
+ long outputDepth,
+ long outputWidth,
+ long outputHeight)
+{
+ long i;
+ THTensor *output2d;
+
+ THNN_(unfolded_copy_vol)(
+ finput, input,
+ kT, kW, kH,
+ dT, dW, dH,
+ pT, pW, pH,
+ nInputPlane,
+ inputDepth, inputWidth, inputHeight,
+ outputDepth, outputWidth, outputHeight
+ );
+
+ output2d = THTensor_(newWithStorage2d)(
+ output->storage, output->storageOffset, nOutputPlane, -1,
+ outputDepth*outputHeight*outputWidth, -1
+ );
+
+ if (bias) {
+ for (i = 0; i < nOutputPlane; i++)
+ {
+ THVector_(fill)(
+ output->storage->data+output->storageOffset+output->stride[0]*i,
+ THTensor_(get1d)(bias, i),
+ outputDepth*outputHeight*outputWidth
+ );
+ }
+ } else {
+ THTensor_(zero)(output);
+ }
+
+ THTensor_(addmm)(output2d, 1, output2d, 1, weight, finput);
+
+ THTensor_(free)(output2d);
+}
+
+void THNN_(VolumetricConvolutionMM_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *finput,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH)
+{
+ int dimf = 0;
+ int dimt = 1;
+ int dimh = 2;
+ int dimw = 3;
+ int freeWeight = 0;
+
+ long nInputPlane;
+ long inputDepth;
+ long inputHeight;
+ long inputWidth;
+ long nOutputPlane;
+ long outputDepth;
+ long outputHeight;
+ long outputWidth;
+
+ THNN_(VolumetricConvolutionMM_shapeCheck)(
+ state, input, NULL, weight, bias,
+ kT, kW, kH, dT, dW, dH, pT, pW, pH);
+ input = THTensor_(newContiguous)(input);
+
+ if (input->nDimension == 5)
+ {
+ dimf++;
+ dimt++;
+ dimh++;
+ dimw++;
+ }
+
+ nInputPlane = input->size[dimf];
+ inputDepth = input->size[dimt];
+ inputHeight = input->size[dimh];
+ inputWidth = input->size[dimw];
+ nOutputPlane = weight->size[0];
+ outputDepth = (inputDepth + 2*pT - kT) / dT + 1;
+ outputHeight = (inputHeight + 2*pH - kH) / dH + 1;
+ outputWidth = (inputWidth + 2*pW - kW) / dW + 1;
+
+ freeWeight = THNN_(view_weight)(&weight);
+
+ if (input->nDimension == 4)
+ {
+ THTensor_(resize2d)(finput, kT*kW*kH*nInputPlane, outputDepth*outputHeight*outputWidth);
+ THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth);
+
+ THNN_(VolumetricConvolutionMM_updateOutput_frame)(
+ input, output, weight, bias, finput,
+ kT, kW, kH,
+ dT, dW, dH,
+ pT, pW, pH,
+ nInputPlane, inputDepth, inputWidth, inputHeight,
+ nOutputPlane, outputDepth, outputWidth, outputHeight
+ );
+ }
+ else
+ {
+ long T = input->size[0];
+ long t;
+
+ THTensor_(resize3d)(finput, T, kT*kW*kH*nInputPlane, outputDepth*outputHeight*outputWidth);
+ THTensor_(resize5d)(output, T, nOutputPlane, outputDepth, outputHeight, outputWidth);
+
+// #pragma omp parallel for private(t)
+ for (t = 0; t < T; t++)
+ {
+ THTensor *input_t = THTensor_(newSelect)(input, 0, t);
+ THTensor *output_t = THTensor_(newSelect)(output, 0, t);
+ THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
+
+ THNN_(VolumetricConvolutionMM_updateOutput_frame)(
+ input_t, output_t, weight, bias, finput_t,
+ kT, kW, kH,
+ dT, dW, dH,
+ pT, pW, pH,
+ nInputPlane, inputDepth, inputWidth, inputHeight,
+ nOutputPlane, outputDepth, outputWidth, outputHeight
+ );
+
+ THTensor_(free)(input_t);
+ THTensor_(free)(output_t);
+ THTensor_(free)(finput_t);
+ }
+ }
+
+ THTensor_(free)(input);
+ if (freeWeight)
+ THTensor_(free)(weight);
+}
+
+static void THNN_(VolumetricConvolutionMM_updateGradInput_frame)(
+ THTensor *gradInput,
+ THTensor *gradOutput,
+ THTensor *weight,
+ THTensor *fgradInput,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH)
+{
+ THTensor *gradOutput2d = THTensor_(newWithStorage2d)(
+ gradOutput->storage, gradOutput->storageOffset,
+ gradOutput->size[0], -1,
+ gradOutput->size[1]*gradOutput->size[2]*gradOutput->size[3], -1
+ );
+
+ THTensor_(addmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput2d);
+ THTensor_(free)(gradOutput2d);
+
+ THTensor_(zero)(gradInput);
+
+ THNN_(unfolded_acc_vol)(
+ fgradInput, gradInput,
+ kT, kW, kH,
+ dT, dW, dH,
+ pT, pW, pH,
+ gradInput->size[0], gradInput->size[1], gradInput->size[3], gradInput->size[2],
+ gradOutput->size[1], gradOutput->size[3], gradOutput->size[2]
+ );
+}
+
+void THNN_(VolumetricConvolutionMM_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH)
+{
+ int nOutputPlane = (int)weight->size[0];
+
+ THNN_(VolumetricConvolutionMM_shapeCheck)(
+ state, input, gradOutput, weight, NULL,
+ kT, kW, kH, dT, dW, dH, pT, pW, pH);
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ int freeWeight = THNN_(view_weight)(&weight);
+
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(resizeAs)(fgradInput, finput);
+ // depending on the BLAS library, fgradInput (result tensor) might
+ // be left uninitialized on zero alpha, which might lead to weird behavior
+ // hence, to be safe, zero it
+ THTensor_(zero)(fgradInput);
+ THTensor *tweight = THTensor_(new)();
+ THTensor_(transpose)(tweight, weight, 0, 1);
+
+ if (input->nDimension == 4)
+ {
+ THNN_(VolumetricConvolutionMM_updateGradInput_frame)(
+ gradInput, gradOutput, tweight, fgradInput,
+ kT, kW, kH,
+ dT, dW, dH,
+ pT, pW, pH
+ );
+ }
+ else
+ {
+ long T = input->size[0];
+ long t;
+
+//#pragma omp parallel for private(t)
+ for (t = 0; t < T; t++)
+ {
+ THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t);
+ THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
+ THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t);
+
+ THNN_(VolumetricConvolutionMM_updateGradInput_frame)(
+ gradInput_t, gradOutput_t, tweight, fgradInput_t,
+ kT, kW, kH,
+ dT, dW, dH,
+ pT, pW, pH
+ );
+
+ THTensor_(free)(gradInput_t);
+ THTensor_(free)(gradOutput_t);
+ THTensor_(free)(fgradInput_t);
+ }
+ }
+
+ THTensor_(free)(tweight);
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ if (freeWeight)
+ THTensor_(free)(weight);
+}
+
+static void THNN_(VolumetricConvolutionMM_accGradParameters_frame)(
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *finput,
+ real scale)
+{
+ long i;
+ THTensor *gradOutput2d = THTensor_(newWithStorage2d)(
+ gradOutput->storage, gradOutput->storageOffset,
+ gradOutput->size[0], -1,
+ gradOutput->size[1]*gradOutput->size[2]*gradOutput->size[3], -1
+ );
+
+ THTensor *tfinput = THTensor_(new)();
+ THTensor_(transpose)(tfinput, finput, 0, 1);
+ THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutput2d, tfinput);
+ THTensor_(free)(tfinput);
+
+ if (gradBias) {
+ for (i = 0; i < gradBias->size[0]; i++)
+ {
+ long k;
+ real sum = 0;
+ real *data = gradOutput2d->storage->data + gradOutput2d->storageOffset + i*gradOutput2d->stride[0];
+ for (k = 0; k < gradOutput2d->size[1]; k++)
+ sum += data[k];
+
+ (gradBias->storage->data + gradBias->storageOffset)[i] += scale * sum;
+ }
+ }
+
+ THTensor_(free)(gradOutput2d);
+}
+
+void THNN_(VolumetricConvolutionMM_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *finput,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH,
+ accreal scale_)
+{
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ int freeWeight;
+ int nOutputPlane = (int)gradWeight->size[0];
+
+ THNN_(VolumetricConvolutionMM_shapeCheck)(
+ state, input, gradOutput, gradWeight, gradBias,
+ kT, kW, kH, dT, dW, dH, pT, pW, pH);
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ freeWeight = THNN_(view_weight)(&gradWeight);
+
+ if (input->nDimension == 4) // non-batch mode
+ {
+ THNN_(VolumetricConvolutionMM_accGradParameters_frame)(gradOutput, gradWeight, gradBias, finput, scale);
+ }
+ else // batch mode
+ {
+ long T = input->size[0];
+ long t;
+
+ for (t = 0; t < T; t++)
+ {
+ THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
+ THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
+
+ THNN_(VolumetricConvolutionMM_accGradParameters_frame)(gradOutput_t, gradWeight, gradBias, finput_t, scale);
+
+ THTensor_(free)(gradOutput_t);
+ THTensor_(free)(finput_t);
+ }
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ if (freeWeight)
+ THTensor_(free)(gradWeight);
+}
+
+#endif
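
The MM variant above trades memory for speed: unfolded_copy_vol lowers every receptive field into one column of finput, of shape (nInputPlane*kT*kH*kW) x (outputDepth*outputHeight*outputWidth); the 5-D weight is viewed as a 2-D matrix by view_weight; and updateOutput reduces to output2d += weight2d * finput (the addmm call). The row layout of finput follows the same plane-major decomposition as the loop index k in unfolded_copy_vol; a sketch of that decode (illustrative helper, not THNN API):

    #include <stdio.h>

    /* row r of the unfolded matrix corresponds to (plane, kt, kh, kw),
       mirroring the index arithmetic in unfolded_copy_vol above */
    static void unfold_row_decode(long r, int kT, int kH, int kW,
                                  int *plane, int *kt, int *kh, int *kw)
    {
      *plane = (int)(r / (kT * kH * kW));
      long rest = r % (kT * kH * kW);
      *kt = (int)(rest / (kH * kW));
      rest = rest % (kH * kW);
      *kh = (int)(rest / kW);
      *kw = (int)(rest % kW);
    }

    int main(void)
    {
      int plane, kt, kh, kw;
      unfold_row_decode(13, 2, 2, 2, &plane, &kt, &kh, &kw);
      printf("plane=%d kt=%d kh=%d kw=%d\n", plane, kt, kh, kw); /* 1 1 0 1 */
      return 0;
    }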
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricDilatedConvolution.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricDilatedConvolution.c
new file mode 100644
index 000000000..ca740f78e
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricDilatedConvolution.c
@@ -0,0 +1,420 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricDilatedConvolution.c"
+#else
+
+static inline void THNN_(VolumetricDilatedConvolution_shapeCheck)(
+ THTensor *input, THTensor *gradOutput,
+ THTensor *weight, THTensor *bias,
+ int kT, int kH, int kW, int dT, int dH, int dW,
+ int padT, int padH, int padW,
+ int dilationT, int dilationH, int dilationW) {
+ THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+ "4D or 5D (batch mode) tensor expected for input, but got: %s");
+ THNN_ARGCHECK(weight->nDimension == 5, 4, weight,
+ "5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor "
+ "expected for weight, but got: %s");
+ THArgCheck(kT > 0 && kW > 0 && kH > 0, 8,
+ "kernel size should be greater than zero, but got kT: %d kH: %d kW: %d", kT, kH, kW);
+ THArgCheck(dT > 0 && dW > 0 && dH > 0, 11,
+ "stride should be greater than zero, but got dT: %d dH: %d dW: %d", dT, dH, dW);
+ THArgCheck(dilationT > 0 && dilationW > 0 && dilationH > 0, 15,
+ "dilation should be greater than zero, but got dilationT: %d, dilationH: %d, dilationW: %d",
+ dilationT, dilationH, dilationW);
+ if (bias != NULL) {
+ THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]);
+ }
+
+ // Params
+ int ndim = input->nDimension;
+ int nInputPlane = weight->size[1];
+ int nOutputPlane = weight->size[0];
+ int dimf = 0;
+ int dimd = 1;
+ int dimh = 2;
+ int dimw = 3;
+
+ if (ndim == 5) {
+ dimf++;
+ dimd++;
+ dimh++;
+ dimw++;
+ }
+
+ long inputDepth = input->size[dimd];
+ long inputHeight = input->size[dimh];
+ long inputWidth = input->size[dimw];
+ long outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1;
+ long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+ long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+
+ if (outputDepth < 1 || outputWidth < 1 || outputHeight < 1)
+ THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small",
+ nInputPlane,inputDepth,inputHeight,inputWidth,nOutputPlane,outputDepth,outputHeight,outputWidth);
+
+ THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane);
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimd, outputDepth);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
+ }
+}
+
+void THNN_(VolumetricDilatedConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *columns,
+ THTensor *ones,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH,
+ int dilationT, int dilationW, int dilationH)
+{
+ THNN_(VolumetricDilatedConvolution_shapeCheck)(
+ input, NULL, weight, bias,
+ kT, kH, kW, dT, dH, dW, padT, padH, padW,
+ dilationT, dilationH, dilationW);
+
+ // Params:
+ int nInputPlane = weight->size[1];
+ int nOutputPlane = weight->size[0];
+
+ input = THTensor_(newContiguous)(input);
+ weight = THTensor_(newContiguous)(weight);
+ bias = bias ? THTensor_(newContiguous)(bias) : bias;
+ int batch = 1;
+ if (input->nDimension == 4) {
+ // Force batch
+ batch = 0;
+ THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
+ }
+
+ long inputDepth = input->size[2];
+ long inputHeight = input->size[3];
+ long inputWidth = input->size[4];
+ long outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1;
+ long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+ long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Resize output
+ THTensor_(resize5d)(output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth);
+ THTensor_(zero)(output);
+
+ // Resize temporary columns
+ THTensor_(resize2d)(columns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth);
+
+ // Define a buffer of ones, for bias accumulation
+ // Note: this buffer can be shared with other modules, it only ever gets increased,
+ // and always contains ones.
+ if (ones->nDimension != 3 ||
+ ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) {
+ // Resize plane and fill with ones...
+ THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth);
+ THTensor_(fill)(ones, 1);
+ }
+
+ // Helpers
+ THTensor *input_n = THTensor_(new)();
+ THTensor *output_n = THTensor_(new)();
+
+ // For each elt in batch, do:
+ for (int elt = 0; elt < batchSize; elt ++) {
+ // Matrix multiply per output:
+ THTensor_(select)(input_n, input, 0, elt);
+ THTensor_(select)(output_n, output, 0, elt);
+
+ // Do Bias first:
+ // M,N,K are dims of matrix A and B
+ long m_ = nOutputPlane;
+ long n_ = outputDepth * outputHeight * outputWidth;
+ long k_ = 1;
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ if (bias) {
+ THBlas_(gemm)(
+ 't', 'n',
+ n_, m_, k_,
+ 1,
+ THTensor_(data)(ones), k_,
+ THTensor_(data)(bias), k_,
+ 0,
+ THTensor_(data)(output_n), n_
+ );
+ } else {
+ THTensor_(zero)(output_n);
+ }
+
+ // Extract columns:
+ THNN_(vol2col)(
+ THTensor_(data)(input_n),
+ nInputPlane, inputDepth, inputHeight, inputWidth,
+ kT, kH, kW, padT, padH, padW, dT, dH, dW,
+ dilationT, dilationH, dilationW,
+ THTensor_(data)(columns)
+ );
+
+ // M,N,K are dims of matrix A and B
+ long m = nOutputPlane;
+ long n = columns->size[1];
+ long k = nInputPlane*kT*kH*kW;
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ THBlas_(gemm)(
+ 'n', 'n',
+ n, m, k,
+ 1,
+ THTensor_(data)(columns), n,
+ THTensor_(data)(weight), k,
+ 1,
+ THTensor_(data)(output_n), n
+ );
+ }
+
+ // Free
+ THTensor_(free)(input_n);
+ THTensor_(free)(output_n);
+
+ // Resize output
+ if (batch == 0) {
+ THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth);
+ THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(weight);
+ if (bias) THTensor_(free)(bias);
+}
+
+void THNN_(VolumetricDilatedConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *gradColumns,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH,
+ int dilationT, int dilationW, int dilationH)
+{
+ THNN_(VolumetricDilatedConvolution_shapeCheck)(
+ input, gradOutput, weight, NULL,
+ kT, kH, kW, dT, dH, dW, padT, padH, padW,
+ dilationT, dilationH, dilationW);
+
+ // Params
+ int nInputPlane = weight->size[1];
+ int nOutputPlane = weight->size[0];
+
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ weight = THTensor_(newContiguous)(weight);
+
+ int batch = 1;
+ if (input->nDimension == 4) {
+ // Force batch
+ batch = 0;
+ THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
+ THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
+ }
+
+ long inputDepth = input->size[2];
+ long inputWidth = input->size[4];
+ long inputHeight = input->size[3];
+ long outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1;
+ long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+ long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Resize output
+ THTensor_(resize5d)(gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth);
+
+ // Resize temporary columns
+ THTensor_(resize2d)(gradColumns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth);
+ THTensor_(zero)(gradColumns);
+
+ // Helpers
+ THTensor *gradInput_n = THTensor_(new)();
+ THTensor *gradOutput_n = THTensor_(new)();
+
+ // For each elt in batch, do:
+ for (int elt = 0; elt < batchSize; elt ++) {
+ // Matrix multiply per sample:
+ THTensor_(select)(gradInput_n, gradInput, 0, elt);
+ THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
+
+ // M,N,K are dims of matrix A and B
+ long m = nInputPlane*kT*kW*kH;
+ long n = gradColumns->size[1];
+ long k = nOutputPlane;
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ THBlas_(gemm)(
+ 'n', 't',
+ n, m, k,
+ 1,
+ THTensor_(data)(gradOutput_n), n,
+ THTensor_(data)(weight), m,
+ 0,
+ THTensor_(data)(gradColumns), n
+ );
+
+ // Unpack columns back into input:
+ THNN_(col2vol)(
+ THTensor_(data)(gradColumns),
+ nInputPlane, inputDepth, inputHeight, inputWidth,
+ kT, kH, kW, padT, padH, padW, dT, dH, dW,
+ dilationT, dilationH, dilationW,
+ THTensor_(data)(gradInput_n)
+ );
+ }
+
+ // Free
+ THTensor_(free)(gradInput_n);
+ THTensor_(free)(gradOutput_n);
+
+ // Resize output
+ if (batch == 0) {
+ THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
+ THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
+ THTensor_(resize4d)(gradInput, nInputPlane, inputDepth, inputHeight, inputWidth);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(weight);
+}
+
+void THNN_(VolumetricDilatedConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *columns,
+ THTensor *ones,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH,
+ int dilationT, int dilationW, int dilationH,
+ accreal scale_)
+{
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ THNN_(VolumetricDilatedConvolution_shapeCheck)(
+ input, gradOutput, gradWeight, gradBias,
+ kT, kH, kW, dT, dH, dW, padT, padH, padW,
+ dilationT, dilationH, dilationW);
+
+ // Params
+ int nInputPlane = gradWeight->size[1];
+ int nOutputPlane = gradWeight->size[0];
+
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ int batch = 1;
+ if (input->nDimension == 4) {
+ // Force batch
+ batch = 0;
+ THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
+ THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
+ }
+
+ long inputDepth = input->size[2];
+ long inputWidth = input->size[4];
+ long inputHeight = input->size[3];
+ long outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1;
+ long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+ long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Define a buffer of ones, for bias accumulation
+ if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) {
+ // Resize plane and fill with ones...
+ THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth);
+ THTensor_(fill)(ones, 1);
+ }
+
+ // Resize temporary columns
+ THTensor_(resize2d)(columns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth);
+
+ // Helpers
+ THTensor *input_n = THTensor_(new)();
+ THTensor *gradOutput_n = THTensor_(new)();
+
+ // For each elt in batch, do:
+ for (int elt = 0; elt < batchSize; elt ++) {
+ // Matrix multiply per output:
+ THTensor_(select)(input_n, input, 0, elt);
+ THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
+
+ // Extract columns:
+ THNN_(vol2col)(
+ THTensor_(data)(input_n),
+ nInputPlane, inputDepth, inputHeight, inputWidth,
+ kT, kH, kW, padT, padH, padW, dT, dH, dW,
+ dilationT, dilationH, dilationW,
+ THTensor_(data)(columns)
+ );
+
+ // M,N,K are dims of matrix A and B
+ long m = nOutputPlane;
+ long n = nInputPlane*kT*kW*kH;
+ long k = columns->size[1];
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ THBlas_(gemm)(
+ 't', 'n',
+ n, m, k,
+ scale,
+ THTensor_(data)(columns), k,
+ THTensor_(data)(gradOutput_n), k,
+ 1,
+ THTensor_(data)(gradWeight), n
+ );
+
+ // Do Bias:
+ // M,N,K are dims of matrix A and B
+ long m_ = nOutputPlane;
+ long k_ = outputDepth * outputHeight * outputWidth;
+
+ // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
+ if (gradBias) {
+ THBlas_(gemv)(
+ 't',
+ k_, m_,
+ scale,
+ THTensor_(data)(gradOutput_n), k_,
+ THTensor_(data)(ones), 1,
+ 1,
+ THTensor_(data)(gradBias), 1
+ );
+ }
+ }
+
+ // Free
+ THTensor_(free)(input_n);
+ THTensor_(free)(gradOutput_n);
+
+ // Resize
+ if (batch == 0) {
+ THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
+ THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+}
+
+#endif
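
All three entry points above use the same dilated output geometry: a kernel of size k with dilation d covers an effective span of d*(k-1)+1 inputs, and that span replaces k in the usual padded size formula. A sketch of the computation (illustrative):

    #include <stdio.h>

    /* matches outputDepth/Height/Width as computed in the code above */
    static long dilated_out(long in, int k, int stride, int pad, int dilation)
    {
      return (in + 2L * pad - ((long)dilation * (k - 1) + 1)) / stride + 1;
    }

    int main(void)
    {
      /* 16 frames, kT = 3, stride 1, pad 2, dilation 2 -> 16 (size-preserving) */
      printf("%ld\n", dilated_out(16, 3, 1, 2, 2));
      return 0;
    }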
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricDilatedMaxPooling.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricDilatedMaxPooling.c
new file mode 100644
index 000000000..66c0f9531
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricDilatedMaxPooling.c
@@ -0,0 +1,515 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricDilatedMaxPooling.c"
+#else
+
+static inline void THNN_(VolumetricDilatedMaxPooling_shapeCheck)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THIndexTensor *indices,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH,
+ int dilationT, int dilationW, int dilationH,
+ bool ceilMode) {
+ int ndim = input->nDimension;
+ int dimN = 0;
+ int dimt = 1;
+ int dimh = 2;
+ int dimw = 3;
+ long nslices;
+ long itime;
+ long iheight;
+ long iwidth;
+ long otime;
+ long oheight;
+ long owidth;
+
+ THArgCheck(kT > 0 && kW > 0 && kH > 0, 5,
+ "kernel size should be greater than zero, but got kT: %d kH: %d kW: %d",
+ kT, kH, kW);
+ THArgCheck(dT > 0 && dW > 0 && dH > 0, 8,
+ "stride should be greater than zero, but got dT: %d dH: %d dW: %d",
+ dT, dH, dW);
+ THArgCheck(dilationT > 0 && dilationW > 0 && dilationH > 0, 14,
+ "dilation should be greater than 0, but got dilationT: %d dilationH: %d dilationW: %d",
+ dilationT, dilationH, dilationW);
+
+ THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+ "4D or 5D (batch mode) tensor expected for input, but got: %s");
+
+ if (input->nDimension == 5)
+ {
+ dimN++;
+ dimt++;
+ dimh++;
+ dimw++;
+ }
+
+ THArgCheck(kT/2 >= pT && kW/2 >= pW && kH/2 >= pH, 2,
+ "pad should be smaller than half of kernel size, but got "
+ "kT: %d kW: %d, kH: %d, padT: %d, padW: %d, padH: %d",
+ kT, kW, kH, pT, pW, pH);
+
+ nslices = input->size[dimN];
+ itime = input->size[dimt];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ if (ceilMode)
+ {
+ otime = (int)(ceil((float)(itime - (dilationT * (kT - 1) + 1) + 2*pT) / dT)) + 1;
+ oheight = (int)(ceil((float)(iheight - (dilationH * (kH - 1) + 1) + 2*pH) / dH)) + 1;
+ owidth = (int)(ceil((float)(iwidth - (dilationW * (kW - 1) + 1) + 2*pW) / dW)) + 1;
+ }
+ else
+ {
+ otime = (int)(floor((float)(itime - (dilationT * (kT - 1) + 1) + 2*pT) / dT)) + 1;
+ oheight = (int)(floor((float)(iheight - (dilationH * (kH - 1) + 1) + 2*pH) / dH)) + 1;
+ owidth = (int)(floor((float)(iwidth - (dilationW * (kW - 1) + 1) + 2*pW) / dW)) + 1;
+ }
+
+ if (pT || pW || pH)
+ {
+ // ensure that the last pooling starts inside the image
+ if ((otime - 1)*dT >= itime + pT)
+ --otime;
+ if ((oheight - 1)*dH >= iheight + pH)
+ --oheight;
+ if ((owidth - 1)*dW >= iwidth + pW)
+ --owidth;
+ }
+
+ if (otime < 1 || owidth < 1 || oheight < 1)
+ THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small",
+ nslices,itime,iheight,iwidth,nslices,otime,oheight,owidth);
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimN, nslices);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimt, otime);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, oheight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, owidth);
+ }
+ if (indices != NULL) {
+ THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimN, nslices);
+ THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimt, otime);
+ THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimh, oheight);
+ THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimw, owidth);
+ }
+}
+
+static void THNN_(VolumetricDilatedMaxPooling_updateOutput_frame)(
+ real *input_p,
+ real *output_p,
+ THIndex_t *indz_p,
+ long nslices,
+ long itime,
+ long iwidth,
+ long iheight,
+ long otime,
+ long owidth,
+ long oheight,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH,
+ int dilationT,
+ int dilationW,
+ int dilationH)
+{
+ long k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nslices; k++)
+ {
+ /* loop over output */
+ long i, j, ti;
+ for (ti = 0; ti < otime; ti++)
+ {
+ for (i = 0; i < oheight; i++)
+ {
+ for (j = 0; j < owidth; j++)
+ {
+ /* local pointers */
+
+ long start_t = ti * dT - pT;
+ long start_h = i * dH - pH;
+ long start_w = j * dW - pW;
+
+ long kernel_t = fminf(kT, kT + start_t);
+ long kernel_h = fminf(kH, kH + start_h);
+ long kernel_w = fminf(kW, kW + start_w);
+
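+ /* the kernel extent was clipped at the padded border above; now advance each start in whole dilation units until it lies inside the image, so the remaining taps stay on the dilated grid */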
+ while(start_t < 0)
+ start_t += dilationT;
+ while(start_h < 0)
+ start_h += dilationH;
+ while(start_w < 0)
+ start_w += dilationW;
+
+ real *ip = input_p + k * itime * iwidth * iheight
+ + start_t * iwidth * iheight + start_h * iwidth + start_w;
+ real *op = output_p + k * otime * owidth * oheight
+ + ti * owidth * oheight + i * owidth + j;
+ THIndex_t *indzp = indz_p + k * otime * owidth * oheight
+ + ti * owidth * oheight + i * owidth + j;
+
+ /* compute local max: */
+ real maxval = -THInf;
+ int x,y,z;
+ int mx, my, mz;
+ mx = my = mz = -1;
+
+ for (z = 0; z < kernel_t; z++)
+ {
+ for (y = 0; y < kernel_h; y++)
+ {
+ for (x = 0; x < kernel_w; x++)
+ {
+ if ((start_t + z * dilationT < itime) && (start_h + y * dilationH < iheight) && (start_w + x * dilationW < iwidth))
+ {
+ real val = *(ip + z * dilationT * iwidth * iheight + y * dilationH * iwidth + x * dilationW);
+ if (val > maxval)
+ {
+ maxval = val;
+ // Store indices w.r.t the kernel dimension
+ mz = z + (kT - kernel_t);
+ my = y + (kH - kernel_h);
+ mx = x + (kW - kernel_w);
+ }
+ }
+ }
+ }
+ }
+
+ // pack the kernel-relative argmax coordinates (mz, my, mx) into the index bytes
+ ((unsigned char*)(indzp))[0] = mz;
+ ((unsigned char*)(indzp))[1] = my;
+ ((unsigned char*)(indzp))[2] = mx;
+ ((unsigned char*)(indzp))[3] = 0;
+
+ /* set output to local max */
+ *op = maxval;
+ }
+ }
+ }
+ }
+}
+
+void THNN_(VolumetricDilatedMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH,
+ int dilationT,
+ int dilationW,
+ int dilationH,
+ bool ceilMode)
+{
+ long nslices;
+ long itime;
+ long iheight;
+ long iwidth;
+ long otime;
+ long oheight;
+ long owidth;
+ real *input_data;
+ real *output_data;
+ THIndex_t *indices_data;
+
+
+ int dimN = 0;
+ int dimt = 1;
+ int dimh = 2;
+ int dimw = 3;
+
+ if (input->nDimension == 5)
+ {
+ dimN++;
+ dimt++;
+ dimh++;
+ dimw++;
+ }
+
+ THNN_(VolumetricDilatedMaxPooling_shapeCheck)(
+ state, input, NULL, NULL,
+ kT, kW, kH, dT, dW, dH,
+ pT, pW, pH, dilationT, dilationW, dilationH,
+ ceilMode);
+
+ /* sizes */
+ nslices = input->size[dimN];
+ itime = input->size[dimt];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ if (ceilMode)
+ {
+ otime = (int)(ceil((float)(itime - (dilationT * (kT - 1) + 1) + 2*pT) / dT)) + 1;
+ oheight = (int)(ceil((float)(iheight - (dilationH * (kH - 1) + 1) + 2*pH) / dH)) + 1;
+ owidth = (int)(ceil((float)(iwidth - (dilationW * (kW - 1) + 1) + 2*pW) / dW)) + 1;
+ }
+ else
+ {
+ otime = (int)(floor((float)(itime - (dilationT * (kT - 1) + 1) + 2*pT) / dT)) + 1;
+ oheight = (int)(floor((float)(iheight - (dilationH * (kH - 1) + 1) + 2*pH) / dH)) + 1;
+ owidth = (int)(floor((float)(iwidth - (dilationW * (kW - 1) + 1) + 2*pW) / dW)) + 1;
+ }
+
+ if (pT || pW || pH)
+ {
+ // ensure that the last pooling starts inside the image
+ if ((otime - 1)*dT >= itime + pT)
+ --otime;
+ if ((oheight - 1)*dH >= iheight + pH)
+ --oheight;
+ if ((owidth - 1)*dW >= iwidth + pW)
+ --owidth;
+ }
+
+ /* get contiguous input */
+ input = THTensor_(newContiguous)(input);
+
+ if (input->nDimension == 4) /* non-batch mode */
+ {
+ /* resize output */
+ THTensor_(resize4d)(output, nslices, otime, oheight, owidth);
+ /* indices will contain ti,i,j uchar locations packed into each index word */
+ THIndexTensor_(resize4d)(indices, nslices, otime, oheight, owidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+ indices_data = THIndexTensor_(data)(indices);
+
+ THNN_(VolumetricDilatedMaxPooling_updateOutput_frame)(
+ input_data, output_data,
+ indices_data,
+ nslices,
+ itime, iwidth, iheight,
+ otime, owidth, oheight,
+ kT, kW, kH,
+ dT, dW, dH,
+ pT, pW, pH,
+ dilationT, dilationW, dilationH
+ );
+ }
+ else /* batch mode */
+ {
+ long p;
+ long nBatch = input->size[0];
+
+ long istride = nslices * itime * iwidth * iheight;
+ long ostride = nslices * otime * owidth * oheight;
+
+ /* resize output */
+ THTensor_(resize5d)(output, nBatch, nslices, otime, oheight, owidth);
+ /* indices will contain ti,i,j locations for each output point */
+ THIndexTensor_(resize5d)(indices, nBatch, nslices, otime, oheight, owidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+ indices_data = THIndexTensor_(data)(indices);
+
+#pragma omp parallel for private(p)
+ for (p=0; p < nBatch; p++)
+ {
+ THNN_(VolumetricDilatedMaxPooling_updateOutput_frame)(
+ input_data + p * istride,
+ output_data + p * ostride,
+ indices_data + p * ostride,
+ nslices,
+ itime, iwidth, iheight,
+ otime, owidth, oheight,
+ kT, kW, kH,
+ dT, dW, dH,
+ pT, pW, pH,
+ dilationT, dilationW, dilationH
+ );
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(input);
+}
+
+static void THNN_(VolumetricDilatedMaxPooling_updateGradInput_frame)(
+ real *gradInput_p,
+ real *gradOutput_p,
+ THIndex_t *indz_p,
+ long nslices,
+ long itime,
+ long iwidth,
+ long iheight,
+ long otime,
+ long owidth,
+ long oheight,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH,
+ int dilationT,
+ int dilationW,
+ int dilationH)
+{
+ long k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nslices; k++)
+ {
+ real *gradInput_p_k = gradInput_p + k * itime * iwidth * iheight;
+ real *gradOutput_p_k = gradOutput_p + k * otime * owidth * oheight;
+ THIndex_t *indz_p_k = indz_p + k * otime * owidth * oheight;
+
+ /* calculate max points */
+ long ti, i, j;
+ for (ti = 0; ti < otime; ti++)
+ {
+ for (i = 0; i < oheight; i++)
+ {
+ for (j = 0; j < owidth; j++)
+ {
+ /* retrieve position of max */
+ THIndex_t * indzp = &indz_p_k[ti * oheight * owidth + i * owidth + j];
+ long maxti = ((unsigned char*)(indzp))[0] * dilationT + ti * dT - pT;
+ long maxi = ((unsigned char*)(indzp))[1] * dilationH + i * dH - pH;
+ long maxj = ((unsigned char*)(indzp))[2] * dilationW + j * dW - pW;
+
+ if (maxti != -1) {
+ /* update gradient */
+ gradInput_p_k[maxti * iheight * iwidth + maxi * iwidth + maxj] +=
+ gradOutput_p_k[ti * oheight * owidth + i * owidth + j];
+ }
+ }
+ }
+ }
+ }
+}
+
+void THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH,
+ int dilationT,
+ int dilationW,
+ int dilationH,
+ bool ceilMode)
+{
+ int nslices;
+ int itime;
+ int iheight;
+ int iwidth;
+ int otime;
+ int oheight;
+ int owidth;
+ real *gradInput_data;
+ real *gradOutput_data;
+ THIndex_t *indices_data;
+
+ int dimN = 0;
+ int dimt = 1;
+ int dimh = 2;
+ int dimw = 3;
+
+ THNN_(VolumetricDilatedMaxPooling_shapeCheck)(
+ state, input, gradOutput, indices,
+ kT, kW, kH, dT, dW, dH,
+ pT, pW, pH, dilationT, dilationW, dilationH,
+ ceilMode);
+
+ /* get contiguous gradOutput */
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ /* resize */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ if (input->nDimension == 5)
+ {
+ dimN++;
+ dimt++;
+ dimh++;
+ dimw++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimN];
+ itime = input->size[dimt];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ otime = gradOutput->size[dimt];
+ oheight = gradOutput->size[dimh];
+ owidth = gradOutput->size[dimw];
+
+ /* get raw pointers */
+ gradInput_data = THTensor_(data)(gradInput);
+ gradOutput_data = THTensor_(data)(gradOutput);
+ indices_data = THIndexTensor_(data)(indices);
+
+ /* backprop */
+ if (input->nDimension == 4) /* non-batch mode */
+ {
+ THNN_(VolumetricDilatedMaxPooling_updateGradInput_frame)(
+ gradInput_data, gradOutput_data,
+ indices_data,
+ nslices,
+ itime, iwidth, iheight,
+ otime, owidth, oheight,
+ dT, dW, dH,
+ pT, pW, pH,
+ dilationT, dilationW, dilationH
+ );
+ }
+ else /* batch mode */
+ {
+ long p;
+ long nBatch = input->size[0];
+
+ long istride = nslices * itime * iwidth * iheight;
+ long ostride = nslices * otime * owidth * oheight;
+
+#pragma omp parallel for private(p)
+ for (p = 0; p < nBatch; p++)
+ {
+ THNN_(VolumetricDilatedMaxPooling_updateGradInput_frame)(
+ gradInput_data + p * istride,
+ gradOutput_data + p * ostride,
+ indices_data + p * ostride,
+ nslices,
+ itime, iwidth, iheight,
+ otime, owidth, oheight,
+ dT, dW, dH,
+ pT, pW, pH,
+ dilationT, dilationW, dilationH
+ );
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricFractionalMaxPooling.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricFractionalMaxPooling.c
new file mode 100644
index 000000000..236986bb9
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricFractionalMaxPooling.c
@@ -0,0 +1,279 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricFractionalMaxPooling.c"
+#else
+
+static long* THNN_(VolumetricFractionalMaxPooling_generateIntervals)(
+ real sample,
+ long inputSize,
+ long outputSize,
+ int poolSize) {
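+  /* Pseudo-random pooling intervals: alpha = (inputSize - poolSize) /
+     (outputSize - 1) is the average step between window starts, and the
+     random sample in [0, 1) shifts where the integer truncation falls.
+     Illustrative example: inputSize=7, poolSize=2, outputSize=3 gives
+     alpha=2.5; sample=0.2 yields starts {0, 3, 5} while sample=0.8 yields
+     {0, 2, 5}; the last window is always pinned to inputSize - poolSize. */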
+ real alpha = (real) (inputSize - poolSize) / (real) (outputSize - 1);
+ long* sequence = (long*) THAlloc(sizeof(long) * outputSize);
+
+ long i;
+ for (i = 0; i < outputSize - 1; ++i) {
+ sequence[i] =
+ (long) ((i + sample) * alpha) - (long) (sample * alpha);
+ }
+ sequence[outputSize - 1] = inputSize - poolSize;
+
+ return sequence;
+}
+
+static void THNN_(VolumetricFractionalMaxPooling_updateOutput_frame)(
+ real* input,
+ real* output,
+ THIndex_t* indices,
+ real* randomSamples,
+ long numPlanes,
+ long inputT, long inputW, long inputH,
+ long outputT, long outputW, long outputH,
+ int poolSizeT, int poolSizeW, int poolSizeH) {
+ long plane;
+#pragma omp parallel for private(plane)
+ for (plane = 0; plane < numPlanes; ++plane) {
+ /* each plane contains 3 random samples, one for T, one for W, and one for H */
+ real* randomSamplesForPlane = randomSamples + plane * 3;
+
+ /* Generate interval sequence */
+ long* sequenceT =
+ THNN_(VolumetricFractionalMaxPooling_generateIntervals)(
+ randomSamplesForPlane[0], inputT, outputT, poolSizeT);
+ long* sequenceW =
+ THNN_(VolumetricFractionalMaxPooling_generateIntervals)(
+ randomSamplesForPlane[1], inputW, outputW, poolSizeW);
+ long* sequenceH =
+ THNN_(VolumetricFractionalMaxPooling_generateIntervals)(
+ randomSamplesForPlane[2], inputH, outputH, poolSizeH);
+
+ /* loop over output */
+ long h, w, t;
+
+ real* inputForPlane = input + plane * inputT * inputW * inputH;
+ real* outputForPlane = output + plane * outputT * outputW * outputH;
+ THIndex_t* indicesForPlane = indices + plane * outputT * outputW * outputH;
+
+ for (h = 0; h < outputH; ++h) {
+ long inputHStart = sequenceH[h];
+
+ for (w = 0; w < outputW; ++w) {
+ long inputWStart = sequenceW[w];
+
+ for (t = 0; t < outputT; ++t) {
+ long inputTStart = sequenceT[t];
+
+ real maxVal = -THInf;
+ long maxIndex = -1;
+
+ long h2, w2, t2;
+ for (h2 = inputHStart; h2 < inputHStart + poolSizeH; ++h2) {
+ for (w2 = inputWStart; w2 < inputWStart + poolSizeW; ++w2) {
+ for (t2 = inputTStart; t2 < inputTStart + poolSizeT; ++t2) {
+ THAssert(h2 >= 0 && h2 < inputH);
+ THAssert(w2 >= 0 && w2 < inputW);
+ THAssert(t2 >= 0 && t2 < inputT);
+
+ long planeIndex = h2 * inputW * inputT + w2 * inputT + t2;
+ real val = inputForPlane[planeIndex];
+ if (val > maxVal) {
+ maxVal = val;
+ maxIndex = planeIndex;
+ }
+ }
+ }
+ }
+
+ THAssert(maxVal != -THInf);
+ THAssert(maxIndex != -1);
+
+ outputForPlane[h * outputW * outputT + w * outputT + t] = maxVal;
+ /* +1 to lua index */
+ indicesForPlane[h * outputW * outputT + w * outputT + t] = maxIndex + TH_INDEX_BASE;
+ }
+ }
+ }
+
+ THFree(sequenceT);
+ THFree(sequenceW);
+ THFree(sequenceH);
+ }
+}
+
+void THNN_(VolumetricFractionalMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int outputT, int outputW, int outputH,
+ int poolSizeT, int poolSizeW, int poolSizeH,
+ THIndexTensor *indices,
+ THTensor *randomSamples) {
+
+ long numBatch = 1;
+ int planeDim = 0;
+ int heightDim = 1;
+ int widthDim = 2;
+ int timeDim = 3;
+
+ long numInputDims = THTensor_(nDimension)(input);
+ THNN_ARGCHECK(numInputDims == 4 || numInputDims == 5, 2, input,
+ "4D or 5D (batch mode) tensor expected for input, but got: %s");
+
+ if (numInputDims == 5) {
+ numBatch = THTensor_(size)(input, 0);
+ planeDim++;
+ heightDim++;
+ widthDim++;
+ timeDim++;
+ }
+
+ /* sizes */
+ long numPlanes = THTensor_(size)(input, planeDim);
+ long inputH = THTensor_(size)(input, heightDim);
+ long inputW = THTensor_(size)(input, widthDim);
+ long inputT = THTensor_(size)(input, timeDim);
+
+  THArgCheck(outputH + poolSizeH - 1 < inputH, 9,
+             "poolSizeH (%d) too large relative to input height (%ld)",
+             poolSizeH, inputH);
+  THArgCheck(outputW + poolSizeW - 1 < inputW, 8,
+             "poolSizeW (%d) too large relative to input width (%ld)",
+             poolSizeW, inputW);
+  THArgCheck(outputT + poolSizeT - 1 < inputT, 7,
+             "poolSizeT (%d) too large relative to input time (%ld)",
+             poolSizeT, inputT);
+
+ /* get contiguous input */
+ input = THTensor_(newContiguous)(input);
+
+ if (numInputDims == 4) {
+ /* resize output */
+ THTensor_(resize4d)(output, numPlanes, outputH, outputW, outputT);
+ /* indices will contain the locations for each output point */
+ THIndexTensor_(resize4d)(indices, numPlanes, outputH, outputW, outputT);
+
+ THNN_(VolumetricFractionalMaxPooling_updateOutput_frame)(
+ THTensor_(data)(input),
+ THTensor_(data)(output),
+ THIndexTensor_(data)(indices),
+ THTensor_(data)(randomSamples),
+ numPlanes, inputT, inputW, inputH,
+ outputT, outputW, outputH, poolSizeT, poolSizeW, poolSizeH);
+ } else {
+ THTensor_(resize5d)(output, numBatch, numPlanes, outputH, outputW, outputT);
+ /* indices will contain the locations for each output point */
+ THIndexTensor_(resize5d)(indices, numBatch, numPlanes, outputH, outputW, outputT);
+
+ long batch;
+#pragma omp parallel for private(batch)
+ for (batch = 0; batch < numBatch; ++batch) {
+ THNN_(VolumetricFractionalMaxPooling_updateOutput_frame)(
+ THTensor_(data)(input) + batch * numPlanes * inputH * inputW * inputT,
+ THTensor_(data)(output) + batch * numPlanes * outputH * outputW * outputT,
+ THIndexTensor_(data)(indices) + batch * numPlanes * outputH * outputW * outputT,
+ THTensor_(data)(randomSamples) + batch * numPlanes * 3,
+ numPlanes, inputT, inputW, inputH,
+ outputT, outputW, outputH, poolSizeT, poolSizeW, poolSizeH);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(input);
+}
+
+static void THNN_(VolumetricFractionalMaxPooling_updateGradInput_frame)(
+ real* gradInput,
+ real* gradOutput,
+ THIndex_t* indices,
+ long numPlanes,
+ long inputT, long inputW, long inputH,
+ long outputT, long outputW, long outputH) {
+ long plane;
+#pragma omp parallel for private(plane)
+ for (plane = 0; plane < numPlanes; plane++) {
+ real* gradInputForPlane = gradInput + plane * inputT * inputW * inputH;
+ real* gradOutputForPlane = gradOutput + plane * outputT * outputW * outputH;
+ THIndex_t* indicesForPlane = indices + plane * outputT * outputW * outputH;
+
+ long h, w, t;
+ for (h = 0; h < outputH; ++h) {
+ for (w = 0; w < outputW; ++w) {
+ for (t = 0; t < outputT; ++t) {
+ long outputIndex = h * outputW * outputT + w * outputT + t;
+ long index = indicesForPlane[outputIndex] - TH_INDEX_BASE;
+ THAssert(index >= 0 && index < inputT * inputW * inputH);
+
+ gradInputForPlane[index] += gradOutputForPlane[outputIndex];
+ }
+ }
+ }
+ }
+}
+
+void THNN_(VolumetricFractionalMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int outputT, int outputW, int outputH,
+ int poolSizeT, int poolSizeW, int poolSizeH,
+ THIndexTensor *indices) {
+
+ long numBatch = 1;
+ int planeDim = 0;
+ int heightDim = 1;
+ int widthDim = 2;
+ int timeDim = 3;
+
+ long numInputDims = THTensor_(nDimension)(input);
+ if (numInputDims == 5) {
+ numBatch = THTensor_(size)(input, 0);
+ planeDim = 1;
+ heightDim++;
+ widthDim++;
+ timeDim++;
+ }
+
+ /* sizes */
+ long numPlanes = THTensor_(size)(input, planeDim);
+ long inputH = THTensor_(size)(input, heightDim);
+ long inputW = THTensor_(size)(input, widthDim);
+ long inputT = THTensor_(size)(input, timeDim);
+
+ THArgCheck(outputT == THTensor_(size)(gradOutput, timeDim), 3,
+ "gradOutput time unexpected");
+ THArgCheck(outputW == THTensor_(size)(gradOutput, widthDim), 3,
+ "gradOutput width unexpected");
+ THArgCheck(outputH == THTensor_(size)(gradOutput, heightDim), 3,
+ "gradOutput height unexpected");
+
+ /* get contiguous gradOutput */
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ /* resize */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ /* backprop */
+ if (numInputDims == 4) {
+ THNN_(VolumetricFractionalMaxPooling_updateGradInput_frame)(
+ THTensor_(data)(gradInput),
+ THTensor_(data)(gradOutput),
+ THIndexTensor_(data)(indices),
+ numPlanes, inputT, inputW, inputH, outputT, outputW, outputH);
+ } else {
+ long batch;
+#pragma omp parallel for private(batch)
+ for (batch = 0; batch < numBatch; ++batch) {
+ THNN_(VolumetricFractionalMaxPooling_updateGradInput_frame)(
+ THTensor_(data)(gradInput) + batch * numPlanes * inputH * inputW * inputT,
+ THTensor_(data)(gradOutput) + batch * numPlanes * outputH * outputW * outputT,
+ THIndexTensor_(data)(indices) + batch * numPlanes * outputH * outputW * outputT,
+ numPlanes, inputT, inputW, inputH, outputT, outputW, outputH);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricFullConvolution.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricFullConvolution.c
new file mode 100644
index 000000000..c974fab50
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricFullConvolution.c
@@ -0,0 +1,541 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricFullConvolution.c"
+#else
+
+static void THNN_(vol2col)(
+ const real *data_vol, const int channels,
+ const int depth, const int height, const int width,
+ const int kT, const int kH, const int kW,
+ const int pT, const int pH, const int pW,
+ const int dT, const int dH, const int dW,
+ const int dilationT, const int dilationH, const int dilationW,
+ real *data_col)
+{
+ int c, t, h, w;
+ int depth_col = (depth + 2 * pT - (dilationT * (kT - 1) + 1)) / dT + 1;
+ int height_col = (height + 2 * pH - (dilationH * (kH - 1) + 1)) / dH + 1;
+ int width_col = (width + 2 * pW - (dilationW * (kW - 1) + 1)) / dW + 1;
+ int channels_col = channels * kT * kH * kW;
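+  /* Each column channel c encodes one (input channel, kernel offset) pair:
+     c = ((c_vol * kT + t_offset) * kH + h_offset) * kW + w_offset,
+     which the mod/div chain below inverts. */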
+ for (c = 0; c < channels_col; ++c)
+ {
+ int w_offset = c % kW;
+ int h_offset = (c / kW) % kH;
+ int t_offset = (c / kW / kH) % kT;
+ int c_vol = c / kT / kH / kW;
+ for (t = 0; t < depth_col; ++t)
+ {
+ for (h = 0; h < height_col; ++h)
+ {
+ for (w = 0; w < width_col; ++w)
+ {
+ int t_pad = t * dT - pT + t_offset * dilationT;
+ int h_pad = h * dH - pH + h_offset * dilationH;
+ int w_pad = w * dW - pW + w_offset * dilationW;
+ if (t_pad >= 0 && t_pad < depth &&
+ h_pad >= 0 && h_pad < height &&
+ w_pad >= 0 && w_pad < width)
+ data_col[((c * depth_col + t) * height_col + h) * width_col + w] =
+ data_vol[((c_vol * depth + t_pad) * height + h_pad) * width + w_pad];
+ else
+ data_col[((c * depth_col + t) * height_col + h) * width_col + w] = 0;
+ }
+ }
+ }
+ }
+}
+
+static void THNN_(col2vol)(
+ const real* data_col, const int channels,
+ const int depth, const int height, const int width,
+ const int kT, const int kH, const int kW,
+ const int pT, const int pH, const int pW,
+ const int dT, const int dH, const int dW,
+ const int dilationT, const int dilationH, const int dilationW,
+ real* data_vol)
+{
+ int c, t, h, w;
+ memset(data_vol, 0, sizeof(real) * depth * height * width * channels);
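+  /* col2vol is the adjoint of vol2col: every column entry is scattered
+     back (accumulated, since windows overlap) into its source voxel. */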
+ int depth_col = (depth + 2 * pT - (dilationT * (kT - 1) + 1)) / dT + 1;
+ int height_col = (height + 2 * pH - (dilationH * (kH - 1) + 1)) / dH + 1;
+ int width_col = (width + 2 * pW - (dilationW * (kW - 1) + 1)) / dW + 1;
+ int channels_col = channels * kT * kH * kW;
+ for (c = 0; c < channels_col; ++c)
+ {
+ int w_offset = c % kW;
+ int h_offset = (c / kW) % kH;
+ int t_offset = (c / kW / kH) % kT;
+ int c_vol = c / kT / kH / kW;
+ for (t = 0; t < depth_col; ++t)
+ {
+ for (h = 0; h < height_col; ++h)
+ {
+ for (w = 0; w < width_col; ++w)
+ {
+ int t_pad = t * dT - pT + t_offset * dilationT;
+ int h_pad = h * dH - pH + h_offset * dilationH;
+ int w_pad = w * dW - pW + w_offset * dilationW;
+ if (t_pad >= 0 && t_pad < depth &&
+ h_pad >= 0 && h_pad < height &&
+ w_pad >= 0 && w_pad < width)
+ data_vol[((c_vol * depth + t_pad) * height + h_pad) * width + w_pad] +=
+ data_col[((c * depth_col + t) * height_col + h) * width_col + w];
+ }
+ }
+ }
+ }
+}
+
+static inline void THNN_(VolumetricFullConvolution_shapeCheck)(
+ THTensor *input, THTensor *gradOutput,
+ THTensor *weight, THTensor *bias,
+ int dT, int dW, int dH, int pT, int pW, int pH,
+ int aT, int aW, int aH) {
+ THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+ "4D or 5D (batch mode) tensor expected for input, but got: %s");
+  // the numbers of input and output planes and the kernel size are indirectly defined by the weight tensor
+ THNN_ARGCHECK(weight->nDimension == 5, 4, weight,
+ "5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor "
+ "expected for weight, but got: %s");
+ THArgCheck(dT > 0 && dW > 0 && dH > 0, 11,
+ "stride should be greater than zero, but got dT: %d dH: %d dW: %d", dT, dH, dW);
+ THArgCheck(aT < dT && aW < dW && aH < dH, 15,
+ "output adjustment must be smaller than stride, but got "
+ "adjT: %d adjH: %d adjW: %d dT: %d dH: %d dW: %d",
+ aT, aH, aW, dT, dH, dW);
+
+ int ndim = input->nDimension;
+ const int nInputPlane = (int)weight->size[0];
+ const int nOutputPlane = (int)weight->size[1];
+ const int kT = (int)weight->size[2];
+ const int kH = (int)weight->size[3];
+ const int kW = (int)weight->size[4];
+
+ if (bias != NULL) {
+ THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[1]);
+ }
+
+ int dimf = 0;
+ int dimd = 1;
+ int dimh = 2;
+ int dimw = 3;
+
+ if (ndim == 5) {
+ dimf++;
+ dimd++;
+ dimh++;
+ dimw++;
+ }
+
+ const long inputWidth = input->size[dimw];
+ const long inputHeight = input->size[dimh];
+ const long inputDepth = input->size[dimd];
+ const long outputWidth = (inputWidth - 1) * dW - 2*pW + kW + aW;
+ const long outputHeight = (inputHeight - 1) * dH - 2*pH + kH + aH;
+ const long outputDepth = (inputDepth - 1) * dT - 2*pT + kT + aT;
+
+ if (outputDepth < 1 || outputWidth < 1 || outputHeight < 1)
+ THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small",
+ nInputPlane,inputDepth,inputHeight,inputWidth,nOutputPlane,outputDepth,outputHeight,outputWidth);
+
+ THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane);
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimd, outputDepth);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
+ }
+}
+
+void THNN_(VolumetricFullConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input, // 4D or 5D (batch) tensor
+ THTensor *output,
+ THTensor *weight, // weight tensor (nInputPlane x nOutputPlane x kT x kH x kW)
+ THTensor *bias,
+ THTensor *finput, // internal columns buffer
+ THTensor *fgradInput, // internal ones buffer
+ int dT, int dW, int dH, // stride of the convolution
+ int pT, int pW, int pH, // padding
+ int aT, int aW, int aH) // extra output adjustment
+{
+ THTensor *columns = finput;
+ THTensor *ones = fgradInput;
+
+ THNN_(VolumetricFullConvolution_shapeCheck)(
+ input, NULL, weight, bias,
+ dT, dW, dH, pT, pW, pH, aT, aW, aH);
+
+ const int nInputPlane = (int)weight->size[0];
+ const int nOutputPlane = (int)weight->size[1];
+ const int kT = (int)weight->size[2];
+ const int kH = (int)weight->size[3];
+ const int kW = (int)weight->size[4];
+
+ input = THTensor_(newContiguous)(input);
+ weight = THTensor_(newContiguous)(weight);
+ bias = bias ? THTensor_(newContiguous)(bias) : bias;
+ int batch = 1;
+ if (input->nDimension == 4)
+ {
+ // Force batch
+ batch = 0;
+ THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
+ }
+
+ const long inputWidth = input->size[4];
+ const long inputHeight = input->size[3];
+ const long inputDepth = input->size[2];
+ const long outputWidth = (inputWidth - 1) * dW - 2*pW + kW + aW;
+ const long outputHeight = (inputHeight - 1) * dH - 2*pH + kH + aH;
+ const long outputDepth = (inputDepth - 1) * dT - 2*pT + kT + aT;
+
+ // Batch size + input planes
+ const long batchSize = input->size[0];
+
+ // Resize output
+ THTensor_(resize5d)(output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth);
+
+ // Resize temporary columns
+ THTensor_(resize2d)(columns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth);
+ THTensor_(zero)(columns);
+
+ // Define a buffer of ones, for bias accumulation
+  // Note: this buffer can be shared with other modules; it only ever grows
+  // and always contains ones.
+ if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth)
+ {
+ // Resize plane and fill with ones...
+ THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth);
+ THTensor_(fill)(ones, 1);
+ }
+
+ // Helpers
+ THTensor *input_n = THTensor_(new)();
+ THTensor *output_n = THTensor_(new)();
+
+ int elt;
+ // For each elt in batch, do:
+ for (elt = 0; elt < batchSize; ++elt)
+ {
+    // Matrix multiply per output:
+ THTensor_(select)(input_n, input, 0, elt);
+ THTensor_(select)(output_n, output, 0, elt);
+
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ const long m = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4];
+ const long n = columns->size[1];
+ const long k = weight->size[0];
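+    // In row-major terms this computes columns[m x n] = weight^T[m x k] *
+    // input_n[k x n]: each input location is projected onto all kernel
+    // elements, and col2vol below scatters the overlapping contributions.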
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ THBlas_(gemm)(
+ 'n', 't',
+ n, m, k,
+ 1,
+ THTensor_(data)(input_n), n,
+ THTensor_(data)(weight), m,
+ 0,
+ THTensor_(data)(columns), n
+ );
+
+ // Unpack columns back into input:
+ THNN_(col2vol)(
+ THTensor_(data)(columns),
+ nOutputPlane, outputDepth, outputHeight, outputWidth,
+ kT, kH, kW,
+ pT, pH, pW,
+ dT, dH, dW,
+ 1, 1, 1,
+ THTensor_(data)(output_n)
+ );
+
+ // Do Bias after:
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ const long m_ = nOutputPlane;
+ const long n_ = outputDepth * outputHeight * outputWidth;
+ const long k_ = 1;
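+    // The GEMM below is a rank-1 update: output_n += bias * ones^T, which
+    // broadcasts each bias value over its entire output plane.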
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ if (bias) {
+ THBlas_(gemm)(
+ 't', 'n',
+ n_, m_, k_,
+ 1,
+ THTensor_(data)(ones), k_,
+ THTensor_(data)(bias), k_,
+ 1,
+ THTensor_(data)(output_n), n_
+ );
+ }
+ }
+
+ // Free
+ THTensor_(free)(input_n);
+ THTensor_(free)(output_n);
+
+ // Resize output
+ if (batch == 0)
+ {
+ THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth);
+ THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(weight);
+ if (bias) THTensor_(free)(bias);
+}
+
+void THNN_(VolumetricFullConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *finput,
+ THTensor *fgradInput, // only used by cuda impl
+ int dT, int dW, int dH, // stride
+ int pT, int pW, int pH, // padding
+ int aT, int aW, int aH) // extra output adjustment
+{
+ THTensor *gradColumns = finput;
+
+  // the numbers of input and output planes and the kernel size are indirectly defined by the weight tensor
+ THNN_(VolumetricFullConvolution_shapeCheck)(
+ input, gradOutput, weight, NULL,
+ dT, dW, dH, pT, pW, pH, aT, aW, aH);
+
+ const int nInputPlane = (int)weight->size[0];
+ const int nOutputPlane = (int)weight->size[1];
+ const int kT = (int)weight->size[2];
+ const int kH = (int)weight->size[3];
+ const int kW = (int)weight->size[4];
+
+ input = THTensor_(newContiguous)(input);
+ weight = THTensor_(newContiguous)(weight);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ int batch = 1;
+ if (input->nDimension == 4)
+ {
+ // Force batch
+ batch = 0;
+ THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
+ THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
+ }
+
+ const long inputWidth = input->size[4];
+ const long inputHeight = input->size[3];
+ const long inputDepth = input->size[2];
+ const long outputWidth = (inputWidth - 1) * dW - 2*pW + kW + aW;
+ const long outputHeight = (inputHeight - 1) * dH - 2*pH + kH + aH;
+ const long outputDepth = (inputDepth - 1) * dT - 2*pT + kT + aT;
+
+ // Batch size + input planes
+ const long batchSize = input->size[0];
+
+ // Resize output
+ THTensor_(resize5d)(gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth);
+ THTensor_(zero)(gradInput);
+
+ // Resize temporary columns
+ THTensor_(resize2d)(gradColumns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth);
+
+ // Helpers
+ THTensor *gradInput_n = THTensor_(new)();
+ THTensor *gradOutput_n = THTensor_(new)();
+
+ int elt;
+ // For each elt in batch, do:
+ for (elt = 0; elt < batchSize; ++elt)
+ {
+    // Matrix multiply per sample:
+ THTensor_(select)(gradInput_n, gradInput, 0, elt);
+ THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
+
+ // Extract columns:
+ THNN_(vol2col)(
+ THTensor_(data)(gradOutput_n),
+ nOutputPlane, outputDepth, outputHeight, outputWidth,
+ kT, kH, kW,
+ pT, pH, pW,
+ dT, dH, dW,
+ 1, 1, 1,
+ THTensor_(data)(gradColumns)
+ );
+
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ const long m = weight->size[0];
+ const long n = gradColumns->size[1];
+ const long k = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4];
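+    // In row-major terms: gradInput_n[m x n] = weight[m x k] * gradColumns[k x n];
+    // the backward pass of a transposed convolution is itself an ordinary
+    // convolution of gradOutput with the same weights.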
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ THBlas_(gemm)(
+ 'n', 'n',
+ n, m, k,
+ 1,
+ THTensor_(data)(gradColumns), n,
+ THTensor_(data)(weight), k,
+ 0,
+ THTensor_(data)(gradInput_n), n
+ );
+ }
+
+ // Free
+ THTensor_(free)(gradInput_n);
+ THTensor_(free)(gradOutput_n);
+
+ // Resize output
+ if (batch == 0)
+ {
+ THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
+ THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
+ THTensor_(resize4d)(gradInput, nInputPlane, inputDepth, inputHeight, inputWidth);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(weight);
+}
+
+void THNN_(VolumetricFullConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int dT, int dW, int dH, // stride
+ int pT, int pW, int pH, // padding
+ int aT, int aW, int aH, // extra output adjustment
+ accreal scale_)
+{
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+  // the numbers of input and output planes and the kernel size are indirectly defined by the gradWeight tensor
+ THNN_(VolumetricFullConvolution_shapeCheck)(
+ input, gradOutput, gradWeight, gradBias,
+ dT, dW, dH, pT, pW, pH, aT, aW, aH);
+
+ int nInputPlane = (int)gradWeight->size[0];
+ int nOutputPlane = (int)gradWeight->size[1];
+ int kT = (int)gradWeight->size[2];
+ int kH = (int)gradWeight->size[3];
+ int kW = (int)gradWeight->size[4];
+
+ THTensor *columns = finput;
+ THTensor *ones = fgradInput;
+
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous");
+ if (gradBias)
+ THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous");
+
+ int batch = 1;
+ if (input->nDimension == 4)
+ {
+ // Force batch
+ batch = 0;
+ THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
+ THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
+ }
+
+ const long inputWidth = input->size[4];
+ const long inputHeight = input->size[3];
+ const long inputDepth = input->size[2];
+ const long outputWidth = (inputWidth - 1) * dW - 2*pW + kW + aW;
+ const long outputHeight = (inputHeight - 1) * dH - 2*pH + kH + aH;
+ const long outputDepth = (inputDepth - 1) * dT - 2*pT + kT + aT;
+
+ // Batch size + input planes
+ const long batchSize = input->size[0];
+
+ // Define a buffer of ones, for bias accumulation
+ if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth)
+ {
+ // Resize plane and fill with ones...
+ THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth);
+ THTensor_(fill)(ones, 1);
+ }
+
+ // Resize temporary columns
+ THTensor_(resize2d)(columns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth);
+
+ // Helpers
+ THTensor *input_n = THTensor_(new)();
+ THTensor *gradOutput_n = THTensor_(new)();
+
+ int elt;
+ // For each elt in batch, do:
+ for (elt = 0; elt < batchSize; ++elt)
+ {
+    // Matrix multiply per output:
+ THTensor_(select)(input_n, input, 0, elt);
+ THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
+
+ // Extract columns:
+ THNN_(vol2col)(
+ THTensor_(data)(gradOutput_n), nOutputPlane,
+ outputDepth, outputHeight, outputWidth,
+ kT, kH, kW,
+ pT, pH, pW,
+ dT, dH, dW,
+ 1, 1, 1,
+ THTensor_(data)(columns)
+ );
+
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    const long n = columns->size[0];   // nOutputPlane * kT * kH * kW
+    const long m = input_n->size[0];   // nInputPlane
+    const long k = columns->size[1];   // inputDepth * inputHeight * inputWidth
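+    // In row-major terms: gradWeight[m x n] += scale * input_n[m x k] *
+    // columns^T[k x n], accumulating the outer products of input planes
+    // and unrolled gradOutput windows over all spatial locations.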
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ THBlas_(gemm)(
+ 't', 'n',
+ n, m, k,
+ scale,
+ THTensor_(data)(columns), k,
+ THTensor_(data)(input_n), k,
+ 1,
+ THTensor_(data)(gradWeight), n
+ );
+
+ // Do Bias:
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ const long m_ = nOutputPlane;
+ const long k_ = outputDepth * outputHeight * outputWidth;
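+    // The GEMV below dots each output plane of gradOutput_n with the ones
+    // vector, i.e. gradBias[j] += scale * (sum of plane j of gradOutput_n).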
+
+ // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
+ if (gradBias) {
+ THBlas_(gemv)(
+ 't',
+ k_, m_,
+ scale,
+ THTensor_(data)(gradOutput_n), k_,
+ THTensor_(data)(ones), 1,
+ 1,
+ THTensor_(data)(gradBias), 1
+ );
+ }
+ }
+
+ // Free
+ THTensor_(free)(input_n);
+ THTensor_(free)(gradOutput_n);
+
+ // Resize
+ if (batch == 0)
+ {
+ THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
+ THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricMaxPooling.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricMaxPooling.c
new file mode 100644
index 000000000..a3601e0b6
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricMaxPooling.c
@@ -0,0 +1,50 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricMaxPooling.c"
+#else
+
+void THNN_(VolumetricMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH,
+ bool ceilMode)
+{
+ THNN_(VolumetricDilatedMaxPooling_updateOutput)(
+ state, input, output, indices,
+ kT, kW, kH, dT, dW, dH,
+ pT, pW, pH, 1, 1, 1, ceilMode);
+}
+
+void THNN_(VolumetricMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH,
+ bool ceilMode)
+{
+ THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
+ state, input, gradOutput, gradInput, indices,
+ kT, kW, kH, dT, dW, dH,
+ pT, pW, pH, 1, 1, 1, ceilMode);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricMaxUnpooling.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricMaxUnpooling.c
new file mode 100644
index 000000000..d9d9e5951
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricMaxUnpooling.c
@@ -0,0 +1,373 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricMaxUnpooling.c"
+#else
+
+static inline void THNN_(VolumetricMaxUnpooling_shapeCheck)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THIndexTensor *indices,
+ int oT,
+ int oW,
+ int oH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH)
+{
+ THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+ "4D or 5D (batch mode) tensor expected for input, but got: %s");
+
+ THNN_CHECK_SHAPE_INDICES(input, indices);
+
+ THArgCheck(dT > 0 && dW > 0 && dH > 0, 10,
+ "stride should be greater than zero, but got dT: %d dH: %d dW: %d",
+ dT, dH, dW);
+
+ int dimw = 3;
+ int dimh = 2;
+ int dimt = 1;
+ int dimn = 0;
+
+ if (input->nDimension == 5)
+ {
+ dimt++;
+ dimw++;
+ dimh++;
+ dimn++;
+ }
+ int nslices = input->size[dimn];
+
+ if (gradOutput != NULL) {
+ if (oT != gradOutput->size[dimt] || oW != gradOutput->size[dimw] || oH != gradOutput->size[dimh])
+ {
+ THError(
+ "Inconsistent gradOutput size. oT= %d, oH= %d, oW= %d, gradOutput: %dx%dx%d",
+ oT, oH, oW, gradOutput->size[dimt], gradOutput->size[dimh], gradOutput->size[dimw]
+ );
+ }
+
+ THNN_CHECK_DIM_SIZE(gradOutput, input->nDimension, dimn, nslices);
+ }
+}
+
+static void THNN_(VolumetricMaxUnpooling_updateOutput_frame)(
+ real *input_p,
+ real *output_p,
+ THIndex_t *ind_p,
+ int nslices,
+ int iT,
+ int iW,
+ int iH,
+ int oT,
+ int oW,
+ int oH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH)
+{
+ int k;
+ int has_error = 0;
+ THIndex_t error_index;
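+  /* THError must not be raised from inside an OpenMP parallel region, so
+     invalid indices are only recorded here and reported after the loop. */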
+#pragma omp parallel for private(k)
+ for (k = 0; k < nslices; k++)
+ {
+ int ti, i, j, maxz, maxy, maxx;
+ for (ti = 0; ti < iT; ti++)
+ {
+ for (i = 0; i < iH; i++)
+ {
+ for (j = 0; j < iW; j++)
+ {
+ int start_t = ti * dT - pT;
+ int start_h = i * dH - pH;
+ int start_w = j * dW - pW;
+
+ real *input_p_k = input_p + k*iT*iW*iH + ti*iW*iH + i*iW + j;
+ THIndex_t *ind_p_k = ind_p + k*iT*iW*iH + ti*iW*iH + i*iW + j;
+
+ maxz = ((unsigned char*)(ind_p_k))[0]; /* retrieve position of max */
+ maxy = ((unsigned char*)(ind_p_k))[1];
+ maxx = ((unsigned char*)(ind_p_k))[2];
+
+ THIndex_t idx = k*oT*oW*oH + oH*oW*(start_t+maxz) + oW*(start_h+maxy) + (start_w+maxx);
+ if (start_t+maxz<0 || start_h+maxy<0 || start_w+maxx<0 || start_t+maxz>=oT
+ || start_h+maxy>=oH || start_w+maxx>=oW)
+ {
+#pragma omp critical
+ {
+ has_error = 1;
+ error_index = idx;
+ }
+ } else {
+ output_p[idx] = *input_p_k; /* update output */
+ }
+ }
+ }
+ }
+ }
+ if (has_error) {
+ THError(
+ "found an invalid max index %ld (output volumes are of size %dx%dx%d)",
+ error_index, oT, oH, oW
+ );
+ }
+}
+
+void THNN_(VolumetricMaxUnpooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int oT,
+ int oW,
+ int oH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH)
+{
+ int dimw = 3;
+ int dimh = 2;
+ int dimt = 1;
+ int nbatch = 1;
+ int nslices;
+ int iT;
+ int iH;
+ int iW;
+ real *input_data;
+ real *output_data;
+ THIndex_t *indices_data;
+
+ THNN_(VolumetricMaxUnpooling_shapeCheck)(
+ state, input, NULL, indices,
+ oT, oW, oH, dT, dW, dH, pT, pW, pH);
+
+ if (input->nDimension == 5)
+ {
+ nbatch = input->size[0];
+ dimt++;
+ dimw++;
+ dimh++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimt-1];
+ iT = input->size[dimt];
+ iH = input->size[dimh];
+ iW = input->size[dimw];
+
+ /* get contiguous input */
+ input = THTensor_(newContiguous)(input);
+ indices = THIndexTensor_(newContiguous)(indices);
+
+ /* resize output */
+ if (input->nDimension == 4)
+ {
+ THTensor_(resize4d)(output, nslices, oT, oH, oW);
+ THTensor_(zero)(output);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+ indices_data = THIndexTensor_(data)(indices);
+
+ THNN_(VolumetricMaxUnpooling_updateOutput_frame)(
+ input_data, output_data,
+ indices_data,
+ nslices,
+ iT, iW, iH,
+ oT, oW, oH,
+ dT, dW, dH, pT, pW, pH
+ );
+ }
+ else
+ {
+ int p;
+
+ THTensor_(resize5d)(output, nbatch, nslices, oT, oH, oW);
+ THTensor_(zero)(output);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+ indices_data = THIndexTensor_(data)(indices);
+
+ for (p = 0; p < nbatch; p++)
+ {
+ THNN_(VolumetricMaxUnpooling_updateOutput_frame)(
+ input_data+p*nslices*iT*iW*iH,
+ output_data+p*nslices*oT*oW*oH,
+ indices_data+p*nslices*iT*iW*iH,
+ nslices,
+ iT, iW, iH,
+ oT, oW, oH,
+ dT, dW, dH,
+ pT, pW, pH
+ );
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(input);
+ THIndexTensor_(free)(indices);
+}
+
+static void THNN_(VolumetricMaxUnpooling_updateGradInput_frame)(
+ real *gradInput_p,
+ real *gradOutput_p,
+ THIndex_t *ind_p,
+ int nslices,
+ int iT,
+ int iW,
+ int iH,
+ int oT,
+ int oW,
+ int oH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH)
+{
+ int k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nslices; k++)
+ {
+ int ti, i, j, maxz, maxy, maxx;
+ for (ti = 0; ti < iT; ti++)
+ {
+ for (i = 0; i < iH; i++)
+ {
+ for (j = 0; j < iW; j++)
+ {
+ int start_t = ti * dT - pT;
+ int start_h = i * dH - pH;
+ int start_w = j * dW - pW;
+
+ real *gradInput_p_k = gradInput_p + k*iT*iW*iH + ti*iW*iH + i*iW + j;
+ THIndex_t *ind_p_k = ind_p + k*iT*iW*iH + ti*iW*iH + i*iW + j;
+
+ maxz = ((unsigned char*)(ind_p_k))[0]; /* retrieve position of max */
+ maxy = ((unsigned char*)(ind_p_k))[1];
+ maxx = ((unsigned char*)(ind_p_k))[2];
+
+ if (start_t+maxz<0 || start_h+maxy<0 || start_w+maxx<0
+ || start_t+maxz>=oT || start_h+maxy>=oH || start_w+maxx>=oW)
+ {
+ THError(
+ "invalid max index z= %d, y= %d, x= %d, oT= %d, oW= %d, oH= %d",
+ start_t+maxz, start_h+maxy, start_w+maxx, oT, oW, oH
+ );
+ }
+ *gradInput_p_k = gradOutput_p[k*oT*oW*oH + oH*oW*(start_t+maxz)
+ + oW*(start_h+maxy) + (start_w+maxx)]; /* update gradient */
+ }
+ }
+ }
+ }
+}
+
+void THNN_(VolumetricMaxUnpooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int oT,
+ int oW,
+ int oH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH)
+{
+ int dimw = 3;
+ int dimh = 2;
+ int dimt = 1;
+ int nbatch = 1;
+ int nslices;
+ int iT;
+ int iH;
+ int iW;
+ real *gradInput_data;
+ real *gradOutput_data;
+ THIndex_t *indices_data;
+
+ THNN_(VolumetricMaxUnpooling_shapeCheck)(
+ state, input, gradOutput, indices,
+ oT, oW, oH, dT, dW, dH, pT, pW, pH);
+
+ // TODO: check gradOutput shape
+ /* get contiguous gradOutput */
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ indices = THIndexTensor_(newContiguous)(indices);
+
+ /* resize */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ if (input->nDimension == 5)
+ {
+ nbatch = input->size[0];
+ dimt++;
+ dimw++;
+ dimh++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimt-1];
+ iT = input->size[dimt];
+ iH = input->size[dimh];
+ iW = input->size[dimw];
+
+ /* get raw pointers */
+ gradInput_data = THTensor_(data)(gradInput);
+ gradOutput_data = THTensor_(data)(gradOutput);
+ indices_data = THIndexTensor_(data)(indices);
+
+ /* backprop */
+ if (input->nDimension == 4)
+ {
+ THNN_(VolumetricMaxUnpooling_updateGradInput_frame)(
+ gradInput_data, gradOutput_data,
+ indices_data,
+ nslices,
+ iT, iW, iH,
+ oT, oW, oH,
+ dT, dW, dH,
+ pT, pW, pH
+ );
+ }
+ else
+ {
+ int p;
+ for (p = 0; p < nbatch; p++)
+ {
+ THNN_(VolumetricMaxUnpooling_updateGradInput_frame)(
+ gradInput_data+p*nslices*iT*iW*iH,
+ gradOutput_data+p*nslices*oT*oW*oH,
+ indices_data+p*nslices*iT*iW*iH,
+ nslices,
+ iT, iW, iH,
+ oT, oW, oH,
+ dT, dW, dH,
+ pT, pW, pH
+ );
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(gradOutput);
+ THIndexTensor_(free)(indices);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricReplicationPadding.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricReplicationPadding.c
new file mode 100644
index 000000000..4d8993ec2
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricReplicationPadding.c
@@ -0,0 +1,357 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricReplicationPadding.c"
+#else
+
+static inline void THNN_(VolumetricReplicationPadding_shapeCheck)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ int pleft, int pright,
+ int ptop, int pbottom,
+ int pfront, int pback) {
+ int dimw = 3;
+ int dimh = 2;
+ int dimd = 1;
+ int dimslices = 0;
+ long nslices;
+ long idepth;
+ long iheight;
+ long iwidth;
+ long odepth;
+ long oheight;
+ long owidth;
+
+ THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+ "4D or 5D (batch mode) tensor expected for input, but got: %s");
+
+ if (input->nDimension == 5)
+ {
+ dimw++;
+ dimh++;
+ dimd++;
+ dimslices++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimslices];
+ idepth = input->size[dimd];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ odepth = idepth + pfront + pback;
+ oheight = iheight + ptop + pbottom;
+ owidth = iwidth + pleft + pright;
+
+  THArgCheck(owidth >= 1 && oheight >= 1 && odepth >= 1, 2,
+             "input (D: %ld, H: %ld, W: %ld) is too small."
+             " Calculated output D: %ld H: %ld W: %ld",
+             idepth, iheight, iwidth, odepth, oheight, owidth);
+
+ if (gradOutput != NULL) {
+    THArgCheck(nslices == THTensor_(size)(gradOutput, dimslices), 3,
+               "gradOutput nslices unexpected. Expected: %ld, Got: %ld",
+               nslices, THTensor_(size)(gradOutput, dimslices));
+    THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3,
+               "gradOutput width unexpected. Expected: %ld, Got: %ld",
+               owidth, THTensor_(size)(gradOutput, dimw));
+    THArgCheck(oheight == THTensor_(size)(gradOutput, dimh), 3,
+               "gradOutput height unexpected. Expected: %ld, Got: %ld",
+               oheight, THTensor_(size)(gradOutput, dimh));
+    THArgCheck(odepth == THTensor_(size)(gradOutput, dimd), 3,
+               "gradOutput depth unexpected. Expected: %ld, Got: %ld",
+               odepth, THTensor_(size)(gradOutput, dimd));
+ }
+}
+
+static void THNN_(VolumetricReplicationPadding_updateOutput_frame)(
+ real *input_p, real *output_p,
+ long nslices,
+ long iwidth, long iheight, long idepth,
+ long owidth, long oheight, long odepth,
+ int pleft, int pright,
+ int ptop, int pbottom,
+ int pfront, int pback)
+{
+ int iStartX = fmax(0, -pleft);
+ int iStartY = fmax(0, -ptop);
+ int iStartZ = fmax(0, -pfront);
+ int oStartX = fmax(0, pleft);
+ int oStartY = fmax(0, ptop);
+ int oStartZ = fmax(0, pfront);
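+  /* Replication padding: every output coordinate is mapped to the nearest
+     valid input coordinate, so border outputs replicate the first or last
+     input column, row, or slice along each axis. */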
+
+ long k, ip_x, ip_y, ip_z;
+#pragma omp parallel for private(k, ip_x, ip_y, ip_z)
+ for (k = 0; k < nslices; k++) {
+ long i, j, z;
+ for (z = 0; z < odepth; z++) {
+ for (i = 0; i < oheight; i++) {
+ for (j = 0; j < owidth; j++) {
+ if (j < pleft) {
+ ip_x = pleft;
+ } else if (j >= pleft && j < iwidth + pleft) {
+ ip_x = j;
+ } else {
+ ip_x = iwidth + pleft - 1;
+ }
+ ip_x = ip_x - oStartX + iStartX;
+
+ if (i < ptop) {
+ ip_y = ptop;
+ } else if (i >= ptop && i < iheight + ptop) {
+ ip_y = i;
+ } else {
+ ip_y = iheight + ptop - 1;
+ }
+ ip_y = ip_y - oStartY + iStartY;
+
+ if (z < pfront) {
+ ip_z = pfront;
+ } else if (z >= pfront && z < idepth + pfront) {
+ ip_z = z;
+ } else {
+ ip_z = idepth + pfront - 1;
+ }
+ ip_z = ip_z - oStartZ + iStartZ;
+
+ real *dest_p = output_p + k * owidth * oheight * odepth +
+ z * owidth * oheight + i * owidth + j;
+ real *src_p = input_p + k * iwidth * iheight * idepth +
+ ip_z * iwidth * iheight + ip_y * iwidth + ip_x;
+ *dest_p = *src_p;
+ }
+ }
+ }
+ }
+}
+
+void THNN_(VolumetricReplicationPadding_updateOutput)(THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int pleft, int pright,
+ int ptop, int pbottom,
+ int pfront, int pback)
+{
+ int dimw = 3;
+ int dimh = 2;
+ int dimd = 1;
+ int dimslices = 0;
+ long nbatch = 1;
+ long nslices;
+ long idepth;
+ long iheight;
+ long iwidth;
+ long odepth;
+ long oheight;
+ long owidth;
+ real *input_data;
+ real *output_data;
+
+  THNN_(VolumetricReplicationPadding_shapeCheck)(
+ state, input, NULL, pleft, pright,
+ ptop, pbottom, pfront, pback);
+
+ if (input->nDimension == 5)
+ {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ dimd++;
+ dimslices++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimslices];
+ idepth = input->size[dimd];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ odepth = idepth + pfront + pback;
+ oheight = iheight + ptop + pbottom;
+ owidth = iwidth + pleft + pright;
+
+ /* get contiguous input */
+ input = THTensor_(newContiguous)(input);
+
+ /* resize output */
+ if (input->nDimension == 4)
+ {
+ THTensor_(resize4d)(output, nslices, odepth, oheight, owidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+
+ THNN_(VolumetricReplicationPadding_updateOutput_frame)(
+ input_data, output_data, nslices, iwidth, iheight, idepth,
+ owidth, oheight, odepth, pleft, pright, ptop, pbottom, pfront,
+ pback);
+ }
+ else
+ {
+ long p;
+
+ THTensor_(resize5d)(output, nbatch, nslices, odepth, oheight, owidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+
+#pragma omp parallel for private(p)
+ for (p = 0; p < nbatch; p++)
+ {
+ THNN_(VolumetricReplicationPadding_updateOutput_frame)(
+ input_data + p * nslices * iwidth * iheight * idepth,
+ output_data + p * nslices * owidth * oheight * odepth,
+ nslices,
+ iwidth, iheight, idepth,
+ owidth, oheight, odepth,
+ pleft, pright,
+ ptop, pbottom,
+ pfront, pback);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(input);
+}
+
+static void THNN_(VolumetricReplicationPadding_updateGradInput_frame)(
+ real *ginput_p, real *goutput_p,
+ long nslices,
+ long iwidth, long iheight, long idepth,
+ long owidth, long oheight, long odepth,
+ int pleft, int pright,
+ int ptop, int pbottom,
+ int pfront, int pback)
+{
+ int iStartX = fmax(0, -pleft);
+ int iStartY = fmax(0, -ptop);
+ int iStartZ = fmax(0, -pfront);
+ int oStartX = fmax(0, pleft);
+ int oStartY = fmax(0, ptop);
+ int oStartZ = fmax(0, pfront);
+
+ long k, ip_x, ip_y, ip_z;
+#pragma omp parallel for private(k, ip_x, ip_y, ip_z)
+ for (k = 0; k < nslices; k++) {
+ long i, j, z;
+ for (z = 0; z < odepth; z++) {
+ for (i = 0; i < oheight; i++) {
+ for (j = 0; j < owidth; j++) {
+ if (j < pleft) {
+ ip_x = pleft;
+ } else if (j >= pleft && j < iwidth + pleft) {
+ ip_x = j;
+ } else {
+ ip_x = iwidth + pleft - 1;
+ }
+ ip_x = ip_x - oStartX + iStartX;
+
+ if (i < ptop) {
+ ip_y = ptop;
+ } else if (i >= ptop && i < iheight + ptop) {
+ ip_y = i;
+ } else {
+ ip_y = iheight + ptop - 1;
+ }
+ ip_y = ip_y - oStartY + iStartY;
+
+ if (z < pfront) {
+ ip_z = pfront;
+ } else if (z >= pfront && z < idepth + pfront) {
+ ip_z = z;
+ } else {
+ ip_z = idepth + pfront - 1;
+ }
+ ip_z = ip_z - oStartZ + iStartZ;
+
+ real *src_p = goutput_p + k * owidth * oheight * odepth +
+ z * owidth * oheight + i * owidth + j;
+ real *dest_p = ginput_p + k * iwidth * iheight * idepth +
+ ip_z * iwidth * iheight + ip_y * iwidth + ip_x;
+ *dest_p += *src_p;
+ }
+ }
+ }
+ }
+}
+
+void THNN_(VolumetricReplicationPadding_updateGradInput)(THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int pleft, int pright,
+ int ptop, int pbottom,
+ int pfront, int pback)
+{
+ int dimw = 3;
+ int dimh = 2;
+ int dimd = 1;
+ int dimslices = 0;
+ long nbatch = 1;
+ long nslices;
+ long idepth;
+ long iheight;
+ long iwidth;
+ long odepth;
+ long oheight;
+ long owidth;
+
+ if (input->nDimension == 5)
+ {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ dimd++;
+ dimslices++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimslices];
+ idepth = input->size[dimd];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ odepth = idepth + pfront + pback;
+ oheight = iheight + ptop + pbottom;
+ owidth = iwidth + pleft + pright;
+
+  THNN_(VolumetricReplicationPadding_shapeCheck)(
+        state, input, gradOutput, pleft, pright,
+        ptop, pbottom, pfront, pback);
+
+ /* get contiguous gradOutput */
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ /* resize */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ /* backprop */
+ if (input->nDimension == 4) {
+ THNN_(VolumetricReplicationPadding_updateGradInput_frame)(
+ THTensor_(data)(gradInput),
+ THTensor_(data)(gradOutput),
+ nslices,
+ iwidth, iheight, idepth,
+ owidth, oheight, odepth,
+ pleft, pright,
+ ptop, pbottom,
+ pfront, pback);
+ } else {
+ long p;
+#pragma omp parallel for private(p)
+ for (p = 0; p < nbatch; p++) {
+ THNN_(VolumetricReplicationPadding_updateGradInput_frame)(
+ THTensor_(data)(gradInput) + p * nslices * idepth * iheight * iwidth,
+ THTensor_(data)(gradOutput) + p * nslices * odepth * oheight * owidth,
+ nslices,
+ iwidth, iheight, idepth,
+ owidth, oheight, odepth,
+ pleft, pright,
+ ptop, pbottom,
+ pfront, pback);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricUpSamplingNearest.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricUpSamplingNearest.c
new file mode 100644
index 000000000..9068fb58d
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricUpSamplingNearest.c
@@ -0,0 +1,226 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricUpSamplingNearest.c"
+#else
+
+static inline void THNN_(VolumetricUpSamplingNearest_shapeCheck)
+ (THTensor *input, THTensor *gradOutput,
+ int scale_factor) {
+  THArgCheck(input != NULL, 2, "4D or 5D input tensor expected but got NULL");
+ THArgCheck(scale_factor > 1, 4,
+ "scale_factor must be greater than 1, but got: %d", scale_factor);
+ THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+ "4D or 5D input tensor expected but got: %s");
+ if (input->nDimension == 4) {
+ int nChannels = THTensor_(size)(input, 0);
+ int inputDepth = THTensor_(size)(input, 1);
+ int inputHeight = THTensor_(size)(input, 2);
+ int inputWidth = THTensor_(size)(input, 3);
+ int outputDepth = inputDepth * scale_factor;
+ int outputHeight = inputHeight * scale_factor;
+ int outputWidth = inputWidth * scale_factor;
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 0, nChannels);
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 1, outputDepth);
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 2, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 3, outputWidth);
+ }
+ } else {
+ int nBatch = THTensor_(size)(input, 0);
+ int nChannels = THTensor_(size)(input, 1);
+ int inputDepth = THTensor_(size)(input, 2);
+ int inputHeight = THTensor_(size)(input, 3);
+ int inputWidth = THTensor_(size)(input, 4);
+ int outputDepth = inputDepth * scale_factor;
+ int outputHeight = inputHeight * scale_factor;
+ int outputWidth = inputWidth * scale_factor;
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, 5, 0, nBatch);
+ THNN_CHECK_DIM_SIZE(gradOutput, 5, 1, nChannels);
+ THNN_CHECK_DIM_SIZE(gradOutput, 5, 2, outputDepth);
+ THNN_CHECK_DIM_SIZE(gradOutput, 5, 3, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, 5, 4, outputWidth);
+ }
+ }
+}
+
+void THNN_(VolumetricUpSamplingNearest_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int scale_factor)
+{
+ THNN_(VolumetricUpSamplingNearest_shapeCheck)(input, NULL, scale_factor);
+ int inputDepth = THTensor_(size)(input, input->nDimension-3);
+ int inputHeight = THTensor_(size)(input, input->nDimension-2);
+ int inputWidth = THTensor_(size)(input, input->nDimension-1);
+ int outputDepth = inputDepth * scale_factor;
+ int outputHeight = inputHeight * scale_factor;
+ int outputWidth = inputWidth * scale_factor;
+
+ if (input->nDimension == 4) {
+ THTensor_(resize4d)(output,
+ THTensor_(size)(input, 0),
+ outputDepth, outputHeight, outputWidth);
+ } else {
+ THTensor_(resize5d)(output,
+ THTensor_(size)(input, 0),
+ THTensor_(size)(input, 1),
+ outputDepth, outputHeight, outputWidth);
+ }
+
+ int dT = scale_factor;
+ int dW = scale_factor;
+ int dH = scale_factor;
+ int xDim = input->nDimension-3;
+ int yDim = input->nDimension-2;
+ int zDim = input->nDimension-1;
+
+ // dims
+ int idim = input->nDimension;
+ int osz0 = output->size[0];
+ int osz1 = output->size[1];
+ int osz2 = output->size[2];
+ int osz3 = output->size[3];
+ int osz4 = 1;
+ if (idim > 4) {
+ osz4 = output->size[4];
+ }
+
+ // get strides
+ long *is = input->stride;
+ long *os = output->stride;
+
+ // get raw pointers
+ real *pin = THTensor_(data)(input);
+ real *pout = THTensor_(data)(output);
+
+ // perform the upsampling
+ int i0, i1, i2, i3, i4, isrc, idst;
+ int iout[5]; // Output indices
+ int iin[5]; // Input indices
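+  // Nearest-neighbour rule: output voxel (x, y, z) reads input voxel
+  // (x / dW, y / dH, z / dT) via integer (floor) division.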
+
+ for (i0 = 0; i0 < osz0; i0++) {
+ iout[0] = i0;
+ iin[0] = i0;
+ for (i1 = 0; i1 < osz1; i1++) {
+ iout[1] = i1;
+ iin[1] = i1;
+ for (i2 = 0; i2 < osz2; i2++) {
+ iout[2] = i2;
+ iin[2] = i2;
+ for (i3 = 0; i3 < osz3; i3++) {
+ iout[3] = i3;
+ iin[3] = i3;
+ for (i4 = 0; i4 < osz4; i4++) {
+ iout[4] = i4;
+ iin[4] = i4;
+
+ // set the indices for the upsampled dimensions
+ iin[xDim] = iout[xDim] / dW;
+ iin[yDim] = iout[yDim] / dH;
+ iin[zDim] = iout[zDim] / dT;
+
+ idst = i0*os[0] + i1*os[1] + i2*os[2] + i3*os[3];
+ isrc = iin[0]*is[0] + iin[1]*is[1] + iin[2]*is[2] + iin[3]*is[3];
+ if (idim > 4) {
+ idst += i4*os[4];
+ isrc += iin[4]*is[4];
+ }
+
+ pout[idst] = pin[isrc];
+ }
+ }
+ }
+ }
+ }
+}
+
+void THNN_(VolumetricUpSamplingNearest_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int scale_factor)
+{
+ THNN_(VolumetricUpSamplingNearest_shapeCheck)(input, gradOutput, scale_factor);
+ THTensor_(resizeAs)(gradInput, input);
+
+ int dW = scale_factor;
+ int dH = scale_factor;
+ int dT = scale_factor;
+ int xDim = gradInput->nDimension-3;
+ int yDim = gradInput->nDimension-2;
+ int zDim = gradInput->nDimension-1;
+
+ // dims
+  int idim = gradInput->nDimension;  // Guaranteed to be 4 or 5
+ int isz0 = gradInput->size[0];
+ int isz1 = gradInput->size[1];
+ int isz2 = gradInput->size[2];
+ int isz3 = gradInput->size[3];
+ int isz4 = 1;
+ if (idim > 4) {
+ isz4 = gradInput->size[4];
+ }
+
+ // get strides
+ long *is = gradInput->stride;
+ long *os = gradOutput->stride;
+
+ // get raw pointers
+ real *pin = THTensor_(data)(gradInput);
+ real *pout = THTensor_(data)(gradOutput);
+
+ // perform the upsampling
+ int i0, i1, i2, i3, i4, isrc, idst, x, y, z;
+ int iin[5]; // Input indices
+ int iout[5]; // Output indices
+
+ THTensor_(zero)(gradInput);
+
+ for (i0 = 0; i0 < isz0; i0++) {
+ iin[0] = i0;
+ iout[0] = i0;
+ for (i1 = 0; i1 < isz1; i1++) {
+ iin[1] = i1;
+ iout[1] = i1;
+ for (i2 = 0; i2 < isz2; i2++) {
+ iin[2] = i2;
+ iout[2] = i2;
+ for (i3 = 0; i3 < isz3; i3++) {
+ iin[3] = i3;
+ iout[3] = i3;
+
+ for (i4 = 0; i4 < isz4; i4++) {
+ iin[4] = i4;
+ iout[4] = i4;
+
+ idst = i0*is[0] + i1*is[1] + i2*is[2] + i3*is[3];
+ if (idim > 4) {
+ idst += i4*is[4];
+ }
+
+ // Now accumulate the gradients from gradOutput
+ for (z = 0; z < dT; z++) {
+ for (y = 0; y < dH; y++) {
+ for (x = 0; x < dW; x++) {
+ iout[xDim] = dW * iin[xDim] + x;
+ iout[yDim] = dH * iin[yDim] + y;
+ iout[zDim] = dT * iin[zDim] + z;
+ isrc = iout[0]*os[0] + iout[1]*os[1] + iout[2]*os[2] + iout[3]*os[3];
+ if (idim > 4) {
+ isrc += iout[4]*os[4];
+ }
+ pin[idst] += pout[isrc];
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricUpSamplingTrilinear.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricUpSamplingTrilinear.c
new file mode 100644
index 000000000..f2b04dba9
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricUpSamplingTrilinear.c
@@ -0,0 +1,213 @@
+// Adapted from interp.cpp from Caffe util by Pauline Luc
+// Originally developed by George Papandreou
+
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricUpSamplingTrilinear.c"
+#else
+
+static inline void THNN_(VolumetricUpSamplingTrilinear_shapeCheck)
+ (THTensor *input, THTensor *gradOutput,
+ int nBatch, int nChannels,
+ int inputDepth, int inputHeight, int inputWidth,
+ int outputDepth, int outputHeight, int outputWidth) {
+ THArgCheck(inputDepth > 0 && inputHeight > 0 && inputWidth > 0
+ && outputDepth > 0 && outputHeight > 0 && outputWidth > 0, 2,
+ "input and output sizes should be greater than 0,"
+ " but got input (D: %d, H: %d, W: %d) output (D: %d, H: %d, W: %d)",
+ inputDepth, inputHeight, inputWidth, outputDepth, outputHeight, outputWidth);
+ if (input != NULL) {
+ THNN_ARGCHECK(input->nDimension == 5, 2, input,
+ "5D input tensor expected but got: %s");
+ }
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, 5, 0, nBatch);
+ THNN_CHECK_DIM_SIZE(gradOutput, 5, 1, nChannels);
+ THNN_CHECK_DIM_SIZE(gradOutput, 5, 2, outputDepth);
+ THNN_CHECK_DIM_SIZE(gradOutput, 5, 3, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, 5, 4, outputWidth);
+ }
+}
+
+void THNN_(VolumetricUpSamplingTrilinear_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int outputDepth,
+ int outputHeight,
+ int outputWidth){
+
+ int nbatch = THTensor_(size)(input, 0);
+ int channels = THTensor_(size)(input, 1);
+ int inputDepth = THTensor_(size)(input, 2);
+ int inputHeight = THTensor_(size)(input, 3);
+ int inputWidth = THTensor_(size)(input, 4);
+
+ THNN_(VolumetricUpSamplingTrilinear_shapeCheck)
+ (input, NULL,
+ nbatch, channels,
+ inputDepth, inputHeight, inputWidth,
+ outputDepth, outputHeight, outputWidth);
+
+ input = THTensor_(newContiguous)(input);
+ THTensor_(resize5d)(output,
+ THTensor_(size)(input, 0),
+ THTensor_(size)(input, 1),
+ outputDepth, outputHeight, outputWidth);
+ THTensor_(zero)(output);
+ real *idata = THTensor_(data)(input);
+ real *odata = THTensor_(data)(output);
+ channels = nbatch * channels;
+ THAssert(inputDepth > 0 && inputHeight > 0 && inputWidth > 0 &&
+ outputDepth > 0 && outputHeight > 0 && outputWidth > 0);
+ // special case: just copy
+ if (inputDepth == outputDepth && inputHeight == outputHeight && inputWidth == outputWidth) {
+ for (int t2 = 0; t2 < outputDepth; ++t2) {
+ const int t1 = t2;
+ for (int h2 = 0; h2 < outputHeight; ++h2) {
+ const int h1 = h2;
+ for (int w2 = 0; w2 < outputWidth; ++w2) {
+ const int w1 = w2;
+ const real* pos1 = &idata[t1 * inputHeight * inputWidth + h1 * inputWidth + w1];
+ real* pos2 = &odata[t2 * outputHeight * outputWidth + h2 * outputWidth + w2];
+ for (int c = 0; c < channels; ++c) {
+ pos2[0] = pos1[0];
+ pos1 += inputWidth * inputHeight * inputDepth;
+ pos2 += outputWidth * outputHeight * outputDepth;
+ }
+ }
+ }
+ }
+ return;
+ }
+ const float rdepth = (outputDepth > 1) ? (float)(inputDepth - 1)/(outputDepth - 1) : 0.f;
+ const float rheight = (outputHeight > 1) ? (float)(inputHeight - 1)/(outputHeight - 1) : 0.f;
+ const float rwidth = (outputWidth > 1) ? (float)(inputWidth - 1) / (outputWidth - 1) : 0.f;
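+  // Align-corners mapping: each output index is scaled by
+  // r = (inSize - 1) / (outSize - 1) and split into an integer base index
+  // (t1, h1, w1) plus a fractional lambda that blends the two neighbouring
+  // input samples along that axis; the eight corner weights below are the
+  // products of the per-axis lambdas.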
+ for (int t2 = 0; t2 < outputDepth; ++t2) {
+ const float t1r = rdepth * t2;
+ const int t1 = t1r;
+ const int t1p = (t1 < inputDepth - 1) ? 1 : 0;
+ const real t1lambda = t1r - t1;
+ const real t0lambda = (real)1. - t1lambda;
+ for (int h2 = 0; h2 < outputHeight; ++h2) {
+ const float h1r = rheight * h2;
+ const int h1 = h1r;
+ const int h1p = (h1 < inputHeight - 1) ? 1 : 0;
+ const real h1lambda = h1r - h1;
+ const real h0lambda = (real)1. - h1lambda;
+ for (int w2 = 0; w2 < outputWidth; ++w2) {
+ const float w1r = rwidth * w2;
+ const int w1 = w1r;
+ const int w1p = (w1 < inputWidth - 1) ? 1 : 0;
+ const real w1lambda = w1r - w1;
+ const real w0lambda = (real)1. - w1lambda;
+ const real* pos1 = &idata[t1 * inputHeight * inputWidth + h1 * inputWidth + w1];
+ real* pos2 = &odata[t2 * outputHeight * outputWidth + h2 * outputWidth + w2];
+ for (int c = 0; c < channels; ++c) {
+ pos2[0] = t0lambda * (h0lambda * (w0lambda * pos1[0] + w1lambda * pos1[w1p])
+ + h1lambda * (w0lambda * pos1[h1p * inputWidth]
+ + w1lambda * pos1[h1p * inputWidth + w1p]))
+ + t1lambda * (h0lambda * (w0lambda * pos1[t1p * inputHeight * inputWidth]
+ + w1lambda * pos1[t1p * inputHeight * inputWidth
+ + w1p])
+ + h1lambda * (w0lambda * pos1[t1p * inputHeight * inputWidth
+ + h1p * inputWidth]
+ + w1lambda * pos1[t1p * inputHeight * inputWidth
+ + h1p * inputWidth + w1p]));
+ pos1 += inputWidth * inputHeight * inputDepth;
+ pos2 += outputWidth * outputHeight * outputDepth;
+ }
+ }
+ }
+ }
+ THTensor_(free)(input);
+}
+
+void THNN_(VolumetricUpSamplingTrilinear_updateGradInput)(
+ THNNState *state,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int nbatch,
+ int channels,
+ int inputDepth,
+ int inputHeight,
+ int inputWidth,
+ int outputDepth,
+ int outputHeight,
+ int outputWidth){
+
+ THNN_(VolumetricUpSamplingTrilinear_shapeCheck)
+ (NULL, gradOutput,
+ nbatch, channels,
+ inputDepth, inputHeight, inputWidth,
+ outputDepth, outputHeight, outputWidth);
+
+ THTensor_(resize5d)(gradInput, nbatch, channels, inputDepth, inputHeight, inputWidth);
+ THTensor_(zero)(gradInput);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ real *data1 = THTensor_(data)(gradInput);
+ real *data2 = THTensor_(data)(gradOutput);
+ channels = nbatch * channels;
+
+ // special case: same-size matching grids
+ if (inputDepth == outputDepth && inputHeight == outputHeight && inputWidth == outputWidth) {
+ for (int t2 = 0; t2 < outputDepth; ++t2) {
+ const int t1 = t2;
+ for (int h2 = 0; h2 < outputHeight; ++h2) {
+ const int h1 = h2;
+ for (int w2 = 0; w2 < outputWidth; ++w2) {
+ const int w1 = w2;
+ real* pos1 = &data1[t1 * inputHeight * inputWidth + h1 * inputWidth + w1];
+ const real* pos2 = &data2[t2 * outputHeight * outputWidth + h2 * outputWidth + w2];
+ for (int c = 0; c < channels; ++c) {
+ pos1[0] += pos2[0];
+ pos1 += inputWidth * inputHeight * inputDepth;
+ pos2 += outputWidth * outputHeight * outputDepth;
+ }
+ }
+ }
+ }
+ return;
+ }
+ const float rdepth = (outputDepth > 1) ? (float)(inputDepth - 1)/(outputDepth - 1) : 0.f;
+ const float rheight = (outputHeight > 1) ? (float)(inputHeight - 1)/(outputHeight - 1) : 0.f;
+ const float rwidth = (outputWidth > 1) ? (float)(inputWidth - 1)/(outputWidth - 1) : 0.f;
+ for (int t2 = 0; t2 < outputDepth; ++t2) {
+ const float t1r = rdepth * t2;
+ const int t1 = t1r;
+ const int t1p = (t1 < inputDepth - 1) ? 1 : 0;
+ const real t1lambda = t1r - t1;
+ const real t0lambda = (real)1. - t1lambda;
+ for (int h2 = 0; h2 < outputHeight; ++h2) {
+ const float h1r = rheight * h2;
+ const int h1 = h1r;
+ const int h1p = (h1 < inputHeight - 1) ? 1 : 0;
+ const real h1lambda = h1r - h1;
+ const real h0lambda = (real)1. - h1lambda;
+ for (int w2 = 0; w2 < outputWidth; ++w2) {
+ const float w1r = rwidth * w2;
+ const int w1 = w1r;
+ const int w1p = (w1 < inputWidth - 1) ? 1 : 0;
+ const real w1lambda = w1r - w1;
+ const real w0lambda = (real)1. - w1lambda;
+ real* pos1 = &data1[t1 * inputHeight * inputWidth + h1 * inputWidth + w1];
+        const real* pos2 = &data2[t2 * outputHeight * outputWidth + h2 * outputWidth + w2];
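+        // Scatter each output gradient onto the 8 input corners it was
+        // interpolated from, using the same trilinear weights as forward.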
+ for (int c = 0; c < channels; ++c) {
+ pos1[0] += t0lambda * h0lambda * w0lambda * pos2[0];
+ pos1[w1p] += t0lambda * h0lambda * w1lambda * pos2[0];
+ pos1[h1p * inputWidth] += t0lambda * h1lambda * w0lambda * pos2[0];
+ pos1[h1p * inputWidth + w1p] += t0lambda * h1lambda * w1lambda * pos2[0];
+ pos1[t1p * inputHeight * inputWidth] += t1lambda * h0lambda * w0lambda * pos2[0];
+ pos1[t1p * inputHeight * inputWidth + w1p] += t1lambda * h0lambda * w1lambda * pos2[0];
+ pos1[t1p * inputHeight * inputWidth + h1p * inputWidth] += t1lambda * h1lambda * w0lambda * pos2[0];
+ pos1[t1p * inputHeight * inputWidth + h1p * inputWidth + w1p] += t1lambda * h1lambda * w1lambda * pos2[0];
+ pos1 += inputWidth * inputHeight * inputDepth;
+ pos2 += outputWidth * outputHeight * outputDepth;
+ }
+ }
+ }
+ }
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/unfold.c b/contrib/lua-torch/nn/lib/THNN/generic/unfold.c
new file mode 100644
index 000000000..14a73b567
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/unfold.c
@@ -0,0 +1,166 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/unfold.c"
+#else
+
+/* note: several kernel positions accumulate into the same input location, so this cannot be parallelized as finely as unfolded_copy (only across input planes) */
+void THNN_(unfolded_acc)(
+ THTensor *finput,
+ THTensor *input,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH,
+ int nInputPlane,
+ int inputWidth,
+ int inputHeight,
+ int outputWidth,
+ int outputHeight)
+{
+ // This function assumes that
+ // outputHeight*dH does not overflow a long
+ // outputWidth*dW does not overflow a long
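+  // finput is the unfolded (im2col-style) matrix of shape
+  // (nInputPlane*kH*kW) x (outputHeight*outputWidth); this routine is the
+  // adjoint of unfolded_copy and accumulates each row back into `input`.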
+
+ int nip;
+
+ real *input_data = THTensor_(data)(input);
+ real *finput_data = THTensor_(data)(finput);
+
+#pragma omp parallel for private(nip)
+ for(nip = 0; nip < nInputPlane; nip++)
+ {
+ int kw, kh, y, x;
+ long ix, iy;
+ for(kh = 0; kh < kH; kh++)
+ {
+ for(kw = 0; kw < kW; kw++)
+ {
+ real *src = finput_data + nip*((size_t)kH*kW*outputHeight*outputWidth) + kh*((size_t)kW*outputHeight*outputWidth) + kw*((size_t)outputHeight*outputWidth);
+ real *dst = input_data + nip*((size_t)inputHeight*inputWidth);
+ if (padW > 0 || padH > 0) {
+ int lpad,rpad;
+ for(y = 0; y < outputHeight; y++) {
+ iy = (long)y*dH - padH + kh;
+ if (iy < 0 || iy >= inputHeight) {
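+            /* this output row maps entirely into the padding: nothing to accumulate */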
+ } else {
+ if (dW==1){
+ ix = 0 - padW + kw;
+ lpad = fmaxf(0,padW-kw);
+ rpad = fmaxf(0,padW-(kW-kw-1));
+ real *dst_slice = dst+(size_t)iy*inputWidth+ix+lpad;
+ THVector_(cadd)(dst_slice, dst_slice, src+(size_t)y*outputWidth+lpad, 1, outputWidth - lpad - rpad); /* note: THVector_add could handle 1 value better */
+ }
+ else{
+ for (x=0; x<outputWidth; x++){
+ ix = (long)x*dW - padW + kw;
+ if (ix < 0 || ix >= inputWidth){
+ }else{
+ real *dst_slice = dst+(size_t)iy*inputWidth+ix;
+ THVector_(cadd)(dst_slice, dst_slice, src+(size_t)y*outputWidth+x, 1, 1);
+ }
+ }
+ }
+ }
+ }
+ } else {
+ for(y = 0; y < outputHeight; y++) {
+ iy = (long)y*dH + kh;
+ ix = 0 + kw;
+ if (dW == 1 ) {
+ real *dst_slice = dst+(size_t)iy*inputWidth+ix;
+ THVector_(cadd)(dst_slice, dst_slice, src+(size_t)y*outputWidth, 1, outputWidth); /* note: THVector_add could handle 1 value better */
+ }else{
+ for(x = 0; x < outputWidth; x++) {
+ real *dst_slice = dst+(size_t)iy*inputWidth+ix+x*dW;
+ THVector_(cadd)(dst_slice, dst_slice, src+(size_t)y*outputWidth+x, 1, 1);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+void THNN_(unfolded_copy)(
+ THTensor *finput,
+ THTensor *input,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH,
+ int nInputPlane,
+ int inputWidth,
+ int inputHeight,
+ int outputWidth,
+ int outputHeight)
+{
+ // This function assumes that
+ // kH*kW does not overflow an int
+ // nInputPlane*kH*kW does not overflow a long
+ // outputHeight*dH does not overflow a long
+ // outputWidth*dW does not overflow a long
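+  // im2col-style unfold: each (plane, kh, kw) triple fills one row of
+  // finput from a strided window of the input; padded positions are
+  // zero-filled, and the contiguous dW == 1 case copies whole rows at once.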
+
+ long k;
+ real *input_data = THTensor_(data)(input);
+ real *finput_data = THTensor_(data)(finput);
+
+#pragma omp parallel for private(k)
+ for(k = 0; k < (long)nInputPlane*kH*kW; k++) {
+ long nip = k / (kH*kW);
+ long rest = k % (kH*kW);
+ long kh = rest / kW;
+ long kw = rest % kW;
+ int x, y;
+ long ix, iy;
+ real *dst = finput_data + nip*((size_t)kH*kW*outputHeight*outputWidth) + kh*((size_t)kW*outputHeight*outputWidth) + kw*((size_t)outputHeight*outputWidth);
+ real *src = input_data + nip*((size_t)inputHeight*inputWidth);
+ if (padW > 0 || padH > 0) {
+ long lpad,rpad;
+ for(y = 0; y < outputHeight; y++) {
+ iy = (long)y*dH - padH + kh;
+ if (iy < 0 || iy >= inputHeight) {
+ memset(dst+(size_t)y*outputWidth, 0, sizeof(real)*outputWidth);
+ } else {
+ if (dW==1){
+ ix = 0 - padW + kw;
+ lpad = fmaxf(0,padW-kw);
+ rpad = fmaxf(0,padW-(kW-kw-1));
+ if (outputWidth-rpad-lpad <= 0) {
+ memset(dst+(size_t)y*outputWidth, 0, sizeof(real)*outputWidth);
+ } else {
+ if (lpad > 0) memset(dst+(size_t)y*outputWidth, 0, sizeof(real)*lpad);
+ memcpy(dst+(size_t)y*outputWidth+lpad, src+(size_t)iy*inputWidth+ix+lpad, sizeof(real)*(outputWidth-rpad-lpad));
+ if (rpad > 0) memset(dst+(size_t)y*outputWidth + outputWidth - rpad, 0, sizeof(real)*rpad);
+ }
+ }
+ else{
+ for (x=0; x<outputWidth; x++){
+ ix = (long)x*dW - padW + kw;
+ if (ix < 0 || ix >= inputWidth)
+ memset(dst+(size_t)y*outputWidth+x, 0, sizeof(real)*1);
+ else
+ memcpy(dst+(size_t)y*outputWidth+x, src+(size_t)iy*inputWidth+ix, sizeof(real)*(1));
+ }
+ }
+ }
+ }
+ } else {
+ for(y = 0; y < outputHeight; y++) {
+ iy = (long)y*dH + kh;
+ ix = 0 + kw;
+ if (dW == 1)
+ memcpy(dst+(size_t)y*outputWidth, src+(size_t)iy*inputWidth+ix, sizeof(real)*outputWidth);
+ else{
+ for (x=0; x<outputWidth; x++)
+ memcpy(dst+(size_t)y*outputWidth+x, src+(size_t)iy*inputWidth+ix+(long)x*dW, sizeof(real)*(1));
+ }
+ }
+ }
+ }
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/init.c b/contrib/lua-torch/nn/lib/THNN/init.c
new file mode 100644
index 000000000..5c8c023dc
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/init.c
@@ -0,0 +1,280 @@
+#include "TH.h"
+#include "THNN.h"
+
+#define torch_(NAME) TH_CONCAT_3(torch_, Real, NAME)
+#define nn_(NAME) TH_CONCAT_3(nn_, Real, NAME)
+
+#define THNN_CHECK_SHAPE(I1, I2) \
+ if (I1 != NULL && I2 != NULL && !THTensor_(isSameSizeAs)(I1, I2)) \
+ { \
+ THDescBuff s1 = THTensor_(sizeDesc)(I1); \
+ THDescBuff s2 = THTensor_(sizeDesc)(I2); \
+ THError(#I1 " and " #I2 " shapes do not match: " \
+ #I1 " %s, " #I2 " %s", s1.str, s2.str); \
+ }
+
+#define THNN_CHECK_SHAPE_INDICES(I1, I2) \
+ THLongStorage *size2 = THLongTensor_newSizeOf(I2); \
+ if (I1 != NULL && I2 != NULL && !THTensor_(isSize)(I1, size2)) \
+ { \
+ THDescBuff s1 = THTensor_(sizeDesc)(I1); \
+ THDescBuff s2 = THLongTensor_sizeDesc(I2); \
+ THLongStorage_free(size2); \
+ THError(#I1 " and " #I2 " shapes do not match: " \
+ #I1 " %s, " #I2 " %s", s1.str, s2.str); \
+ } else { \
+ THLongStorage_free(size2); \
+ }
+
+#define THNN_CHECK_NELEMENT(I1, I2) \
+ if (I1 != NULL && I2 != NULL ) { \
+ ptrdiff_t n1 = THTensor_(nElement)(I1); \
+ ptrdiff_t n2 = THTensor_(nElement)(I2); \
+ if (n1 != n2) \
+ { \
+ THDescBuff s1 = THTensor_(sizeDesc)(I1); \
+ THDescBuff s2 = THTensor_(sizeDesc)(I2); \
+ THError(#I1 " and " #I2 " have different number of elements: " \
+            #I1 " %s has %ld elements, while " \
+            #I2 " %s has %ld elements", s1.str, n1, s2.str, n2); \
+ } \
+ }
+
+#define THNN_CHECK_DIM_SIZE(T, DIM, DIM_SIZE, SIZE) \
+ if (THTensor_(nDimension)(T) != DIM || \
+ THTensor_(size)(T, DIM_SIZE) != SIZE) { \
+ THDescBuff s1 = THTensor_(sizeDesc)(T); \
+ THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \
+ " but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \
+ }
+
+#define THNN_CHECK_DIM_SIZE_INDICES(T, DIM, DIM_SIZE, SIZE) \
+ if (THIndexTensor_(nDimension)(T) != DIM || \
+ THIndexTensor_(size)(T, DIM_SIZE) != SIZE) { \
+ THDescBuff s1 = THIndexTensor_(sizeDesc)(T); \
+ THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \
+ " but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \
+ }
+
+#define THNN_ARGCHECK(COND, ARG, T, FORMAT) \
+ if (!(COND)) { \
+ THDescBuff s1 = THTensor_(sizeDesc)(T); \
+ THArgCheck(COND, ARG, FORMAT, s1.str); \
+ }
+
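+/* Each generic/*.c below is included once per floating-point type via
+   THGenerateFloatTypes.h, which defines `real` and expands the
+   THTensor_()/THNN_() name macros for both float and double. */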
+#include "generic/Abs.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/AbsCriterion.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/BCECriterion.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/ClassNLLCriterion.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SpatialClassNLLCriterion.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/DistKLDivCriterion.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/ELU.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/HardShrink.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/HardTanh.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/GatedLinearUnit.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/L1Cost.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/LeakyReLU.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/FusedRNNKernel.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/LogSigmoid.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/LogSoftMax.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/LookupTable.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/MSECriterion.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/MarginCriterion.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SoftMarginCriterion.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/MultiLabelMarginCriterion.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/MultiMarginCriterion.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/Linear.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/PReLU.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/RReLU.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/Sigmoid.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SmoothL1Criterion.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SoftMax.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SoftPlus.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SoftShrink.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SparseLinear.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/IndexLinear.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/Sqrt.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/Square.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/Tanh.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/Threshold.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/TemporalConvolution.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/TemporalSubSampling.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/TemporalMaxPooling.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/TemporalRowConvolution.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/BatchNormalization.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/unfold.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SpatialConvolutionMap.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SpatialConvolutionMM.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SpatialDepthWiseConvolution.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SpatialConvolutionLocal.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SpatialFullConvolution.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SpatialFullConvolutionMap.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SpatialDilatedConvolution.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SpatialAdaptiveMaxPooling.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SpatialAdaptiveAveragePooling.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SpatialAveragePooling.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SpatialFractionalMaxPooling.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SpatialMaxPooling.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SpatialDilatedMaxPooling.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SpatialMaxUnpooling.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SpatialSubSampling.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SpatialUpSamplingNearest.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SpatialUpSamplingBilinear.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/VolumetricAveragePooling.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/VolumetricConvolution.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/VolumetricConvolutionMM.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/VolumetricFullConvolution.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/VolumetricDilatedConvolution.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/VolumetricMaxPooling.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/VolumetricDilatedMaxPooling.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/VolumetricFractionalMaxPooling.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/VolumetricMaxUnpooling.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SpatialReflectionPadding.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/SpatialReplicationPadding.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/VolumetricReplicationPadding.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/VolumetricUpSamplingNearest.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/VolumetricUpSamplingTrilinear.c"
+#include "THGenerateFloatTypes.h"
+
diff --git a/contrib/lua-torch/nn/mkdocs.yml b/contrib/lua-torch/nn/mkdocs.yml
new file mode 100644
index 000000000..a37a34fb0
--- /dev/null
+++ b/contrib/lua-torch/nn/mkdocs.yml
@@ -0,0 +1,18 @@
+site_name: nn
+theme : simplex
+repo_url : https://github.com/torch/nn
+use_directory_urls : false
+markdown_extensions: [extra]
+docs_dir : doc
+pages:
+- [index.md, Home]
+- [module.md, Modules, Module Interface]
+- [containers.md, Modules, Containers]
+- [transfer.md, Modules, Transfer Functions]
+- [simple.md, Modules, Simple Layers]
+- [table.md, Modules, Table Layers]
+- [convolution.md, Modules, Convolution Layers]
+- [criterion.md, Criterion, Criterions]
+- [overview.md, Additional Documentation, Overview]
+- [training.md, Additional Documentation, Training]
+- [testing.md, Additional Documentation, Testing]
diff --git a/contrib/lua-torch/nn/test.lua b/contrib/lua-torch/nn/test.lua
new file mode 100755
index 000000000..4e3f627fc
--- /dev/null
+++ b/contrib/lua-torch/nn/test.lua
@@ -0,0 +1,8787 @@
+-- you can easily test specific units like this:
+-- th -lnn -e "nn.test{'LookupTable'}"
+-- th -lnn -e "nn.test{'LookupTable', 'Add'}"
+
+local mytester = torch.Tester()
+local jac
+local sjac
+
+local precision = 1e-5
+local expprecision = 1.1e-4
+
+local nntest = torch.TestSuite()
+
+local function equal(t1, t2, msg)
+ if (torch.type(t1) == "table") then
+ for k, v in pairs(t2) do
+ equal(t1[k], t2[k], msg)
+ end
+ else
+ mytester:eq(t1, t2, 0.00001, msg)
+ end
+end
+
+
+--[[ Generate tests to exercise the tostring component of modules. ]]
+local tostringTestModules = {
+ nnLinear = nn.Linear(1, 2),
+ nnReshape = nn.Reshape(10),
+ nnSpatialZeroPadding = nn.SpatialZeroPadding(1, 1, 1, 1)}
+for test_name, component in pairs(tostringTestModules) do
+ nntest['tostring' .. test_name] =
+ function ()
+ mytester:assert(tostring(component):find(
+ torch.type(component) .. '(', 1, true) ~= nil,
+ 'nn components should have a descriptive tostring' ..
+ ' beginning with the classname')
+ end
+end
+
+function nntest.Add()
+ local inj_vals = {math.random(3,5), 1} -- Also test the inj = 1 spatial case
+ local ini = math.random(3,5)
+ local ink = math.random(3,5)
+
+ for ind, inj in pairs(inj_vals) do
+ local input = torch.Tensor(ini,inj,ink):zero()
+ local module = nn.Add(ini,inj,ink)
+
+ -- 1D
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+ mytester:assertlt(err,precision, 'error on bias ')
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+ mytester:assertlt(err,precision, 'error on bias [direct update] ')
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+ mytester:assertlt(err, precision, string.format('error on bias [%s]', t))
+ end
+
+ -- 2D
+ local nframe = math.random(50,70)
+ local input = torch.Tensor(nframe, ini,inj,ink):zero()
+
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+ mytester:assertlt(err,precision, 'error on bias ')
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+ mytester:assertlt(err,precision, 'error on bias [direct update] ')
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+ mytester:assertlt(err, precision, string.format('error on bias [%s]', t))
+ end
+
+ -- IO
+ local ferr,berr = jac.testIO(module,input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+ end
+end
+
+function nntest.Bottle()
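+   -- nn.Bottle applies the wrapped module on a 2D view of the input
+   -- (collapsing the leading dimensions), then restores the original
+   -- shape; both tests check equivalence against an explicit
+   -- View -> module -> View pipeline.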
+ local ini = 2
+ local inj = 3
+ local ink = 4
+ local out = 5
+ local input = torch.Tensor(ini,inj,ink):normal()
+ local linear = nn.Linear(ink, out)
+ local module1 = nn.Bottle(linear)
+ local module2 = nn.Sequential()
+ module2:add(nn.View(ini*inj, ink))
+ module2:add(linear)
+ module2:add(nn.View(ini, inj, out))
+ local output1 = module1:forward(input)
+ local output2 = module2:forward(input)
+ mytester:eq(output1, output2, 0.0001, 'Bottle output not the same as Module')
+
+ local shape = {4, 5, 6, 7, 8, 1, 3}
+ local input = torch.Tensor(table.unpack(shape)):normal()
+ local module = nn.Sequential()
+ module:add(nn.Squeeze(2))
+ module:add(nn.Linear(3, 3))
+ local module1 = nn.Bottle(module, 3, 2)
+ local outShape = {4, 5, 6, 7, 8, 3}
+ local module2 = nn.Sequential()
+ module2:add(nn.View(4*5*6*7*8, 1, 3))
+ module2:add(module)
+ module2:add(nn.View(table.unpack(outShape)))
+ local output1 = module1:forward(input)
+ local grad = torch.Tensor(output1:size()):normal()
+ local gradOutput1 = module1:backward(input, grad):clone()
+ local output2 = module2:forward(input)
+ local gradOutput2 = module2:backward(input, grad):clone()
+ mytester:eq(output1, output2, 0.0001, 'Bottle output not the same as Module')
+ mytester:eq(gradOutput1, gradOutput2, 0.0001, 'Bottle gradOutput not the same as Module')
+end
+
+function nntest.WeightNorm()
+ local input = torch.rand(10, 5)
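+   -- nn.WeightNorm reparameterizes the wrapped module's weight as
+   -- w = g * v / ||v||, so the Jacobian is checked w.r.t. g and v
+   -- (plus the untouched bias) instead of w itself.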
+
+ -- temporal convolution
+ local model = nn.WeightNorm(nn.TemporalConvolution(5, 20, 2, 1))
+ local err = nn.Jacobian.testJacobianParameters(model, input,
+ model.bias, model.gradBias)
+ mytester:assert(err < precision, 'Temporal Convolution bias')
+ err = nn.Jacobian.testJacobianParameters(model, input,
+ model.g, model.gradG)
+ mytester:assert(err < precision, 'Temporal Convolution g')
+ err = nn.Jacobian.testJacobianParameters(model, input,
+ model.v, model.gradV)
+ mytester:assert(err < precision, 'Temporal Convolution v')
+
+ -- linear
+ model = nn.WeightNorm(nn.Linear(5, 20))
+ err = nn.Jacobian.testJacobianParameters(model, input,
+ model.bias, model.gradBias)
+ mytester:assert(err < precision, 'Linear bias')
+ err = nn.Jacobian.testJacobianParameters(model, input, model.g, model.gradG)
+ mytester:assert(err < precision, 'Linear g')
+ err = nn.Jacobian.testJacobianParameters(model, input,
+ model.v, model.gradV)
+ mytester:assert(err < precision, 'Linear v')
+
+ -- euclidean with weight but no bias
+ input = torch.rand(10, 5)
+ model = nn.WeightNorm(nn.Euclidean(5, 20))
+ err = nn.Jacobian.testJacobianParameters(model, input, model.g, model.gradG)
+ mytester:assert(err < precision, 'Euclidean g')
+ err = nn.Jacobian.testJacobianParameters(model, input,
+ model.v, model.gradV)
+ mytester:assert(err < precision, 'Euclidean v')
+
+ -- spatial convolution with 4D weights
+ input = torch.rand(5, 10, 10)
+ model = nn.WeightNorm(nn.SpatialConvolution(5, 20, 2, 2, 3, 3, 1, 1), 2)
+ err = nn.Jacobian.testJacobianParameters(model, input,
+ model.bias, model.gradBias)
+ mytester:assert(err < precision, 'Spatial Convolution bias')
+ err = nn.Jacobian.testJacobianParameters(model, input,
+ model.g, model.gradG)
+ mytester:assert(err < precision, 'Spatial Convolution g')
+ err = nn.Jacobian.testJacobianParameters(model, input,
+ model.v, model.gradV)
+ mytester:assert(err < precision, 'Spatial Convolution v')
+
+ -- linear save/load
+ model = nn.WeightNorm(nn.Linear(5, 20))
+ input = torch.rand(10, 5)
+ local out = model:forward(input)
+ local modelr = torch.deserialize(torch.serialize(model))
+ local outr = modelr:forward(input)
+ mytester:assertTensorEq(out, outr)
+end
+
+function nntest.LinearWeightNorm()
+ local input = torch.rand(10, 5)
+ local model = nn.LinearWeightNorm(5, 20)
+
+ -- check gradient
+ local err = nn.Jacobian.testJacobianParameters(model, input, model.bias, model.gradBias)
+ mytester:assert(err < precision, 'bias')
+ err = nn.Jacobian.testJacobianParameters(model, input, model.g, model.gradG)
+ mytester:assert(err < precision, 'g')
+ err = nn.Jacobian.testJacobianParameters(model, input, model.v, model.gradV)
+ mytester:assert(err < precision, 'v')
+
+ -- check conversion functions
+ local linear = nn.Linear(5,20)
+ local wnFromLin = nn.LinearWeightNorm.fromLinear(linear)
+ local linFromWn = wnFromLin:toLinear()
+
+ local linOut = linear:forward(input)
+ local wnOut = wnFromLin:forward(input)
+ local linFromWnOut = linFromWn:forward(input)
+
+ mytester:assertTensorEq(linOut, wnOut, precision, "outputs are not equivalent")
+ mytester:assertTensorEq(wnOut, linFromWnOut, precision, "outputs are not equivalent")
+
+ -- check conversion with nobias
+ linear = nn.Linear(5,20,false)
+ wnFromLin = nn.LinearWeightNorm.fromLinear(linear)
+ linFromWn = wnFromLin:toLinear()
+
+ linOut = linear:forward(input)
+ wnOut = wnFromLin:forward(input)
+ linFromWnOut = linFromWn:forward(input)
+
+ mytester:assertTensorEq(linear.weight, wnFromLin.weight, precision, "weights are not equivalent")
+ mytester:assert(not wnFromLin.bias)
+ mytester:assert(not linear.bias)
+ mytester:assertTensorEq(linOut, wnOut, precision, "outputs are not equivalent")
+ mytester:assertTensorEq(wnOut, linFromWnOut, precision, "outputs are not equivalent")
+
+ -- check gradient with nobias
+ model = wnFromLin
+
+ err = nn.Jacobian.testJacobianParameters(model, input, model.g, model.gradG)
+ mytester:assert(err < precision, 'g')
+ err = nn.Jacobian.testJacobianParameters(model, input, model.v, model.gradV)
+ mytester:assert(err < precision, 'v')
+end
+
+function nntest.CAdd()
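+   -- nn.CAdd adds a learned bias that is broadcast over singleton
+   -- dimensions; each helper below exercises one broadcasting pattern
+   -- (per channel, per row, per column, per element, ...).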
+ local function testBackwardPass(module, input, params, dparams)
+ local err = jac.testJacobian(module,input)
+      mytester:assertlt(err,precision, "error computing gradients w.r.t. inputs")
+
+      err = jac.testJacobianParameters(module, input, params, dparams)
+      mytester:assertlt(err,precision, "error computing gradients w.r.t. params")
+
+      err = jac.testJacobianUpdateParameters(module, input, module.bias)
+      mytester:assertlt(err,precision, "error in update using gradients w.r.t. parameters")
+
+ --Test all of the various update methods
+ for test, err in pairs(jac.testAllUpdate(module, input, "bias", "gradBias")) do
+ mytester:assertlt(err, precision, string.format("error on bias [%s]", test))
+ end
+ end
+
+ local function testModuleIO(module, input)
+ local fwdErr,bkwdErr = jac.testIO(module,input)
+ mytester:asserteq(fwdErr, 0, torch.typename(module) .. " - i/o forward err ")
+ mytester:asserteq(bkwdErr, 0, torch.typename(module) .. " - i/o backward err ")
+ end
+
+ local function testCAddWithNonBatchedInput()
+ local channels = math.random(3,5)
+ local width = math.random(3,5)
+ local height = math.random(3,5)
+
+ local input = torch.Tensor(channels, height, width):zero()
+
+ --Per channel bias
+ local module = nn.CAdd(channels, 1, 1)
+ local params, gradParams = module:getParameters()
+
+ testBackwardPass(module, input, params, gradParams)
+
+ input:zero()
+ local output = module:forward(input)
+ mytester:assert(output:isSameSizeAs(input))
+
+ for i = 1, module.bias:view(-1):size(1) do
+ local bias = module.bias:view(-1)[i]
+ local result = output[i]:view(-1)
+ local expectedResult = torch.Tensor({bias}):expandAs(result)
+ mytester:assertTensorEq(result, expectedResult, precision)
+ end
+
+ --Per row bias
+ module = nn.CAdd(1, height, 1)
+ params, gradParams = module:getParameters()
+
+ testBackwardPass(module, input, params, gradParams)
+
+ input:zero()
+ output = module:forward(input)
+ mytester:assert(output:isSameSizeAs(input))
+
+ for i = 1, module.bias:view(-1):size(1) do
+ local bias = module.bias:view(-1)[i]
+ local result = output[{{}, {i}, {}}]:contiguous():view(-1)
+ local expectedResult = torch.Tensor({bias}):expandAs(result)
+ mytester:assertTensorEq(result, expectedResult, precision)
+ end
+
+ --Per column bias
+ module = nn.CAdd(1, 1, width)
+ params, gradParams = module:getParameters()
+
+ testBackwardPass(module, input, params, gradParams)
+
+ input:zero()
+ output = module:forward(input)
+ mytester:assert(output:isSameSizeAs(input))
+
+ for i = 1, module.bias:view(-1):size(1) do
+ local bias = module.bias:view(-1)[i]
+ local result = output[{{}, {}, {i}}]:contiguous():view(-1)
+ local expectedResult = torch.Tensor({bias}):expandAs(result)
+ mytester:assertTensorEq(result, expectedResult, precision)
+ end
+
+ --Per input component bias
+ module = nn.CAdd(channels, height, width)
+ params, gradParams = module:getParameters()
+
+ testBackwardPass(module, input, params, gradParams)
+
+ input:zero()
+ output = module:forward(input)
+
+ mytester:assert(output:isSameSizeAs(input))
+ mytester:assert(module.bias:isSameSizeAs(input))
+ mytester:assertTensorEq(module.bias, output, precision)
+
+ testModuleIO(module, input)
+ end
+
+ local function testCAddWithBatchedInput()
+ local batchSize = math.random(3,5)
+ local channels = math.random(3,5)
+ local width = math.random(3,5)
+ local height = math.random(3,5)
+
+ local input = torch.Tensor(batchSize, channels, height, width):zero()
+
+ --Per batch bias
+ local module = nn.CAdd(batchSize, 1, 1, 1)
+ local params, gradParams = module:getParameters()
+
+ testBackwardPass(module, input, params, gradParams)
+
+ input:zero()
+ local output = module:forward(input)
+ mytester:assert(output:isSameSizeAs(input))
+
+ for i = 1, module.bias:view(-1):size(1) do
+ local bias = module.bias:view(-1)[i]
+ local result = output[i]:view(-1)
+ local expectedResult = torch.Tensor({bias}):expandAs(result)
+ mytester:assertTensorEq(result, expectedResult, precision)
+ end
+
+ --Per channel bias
+ module = nn.CAdd(1, channels, 1, 1)
+ params, gradParams = module:getParameters()
+
+ testBackwardPass(module, input, params, gradParams)
+
+ input:zero()
+ output = module:forward(input)
+ mytester:assert(output:isSameSizeAs(input))
+
+ for i = 1, module.bias:view(-1):size(1) do
+ local bias = module.bias:view(-1)[i]
+ local result = output[{{}, {i}, {}, {}}]:contiguous():view(-1)
+ local expectedResult = torch.Tensor({bias}):expandAs(result)
+ mytester:assertTensorEq(result, expectedResult, precision)
+ end
+
+ --Per row bias
+ module = nn.CAdd(1, 1, height, 1)
+ params, gradParams = module:getParameters()
+
+ testBackwardPass(module, input, params, gradParams)
+
+ input:zero()
+ output = module:forward(input)
+ mytester:assert(output:isSameSizeAs(input))
+
+ for i = 1, module.bias:view(-1):size(1) do
+ local bias = module.bias:view(-1)[i]
+ local result = output[{{}, {}, {i}, {}}]:contiguous():view(-1)
+ local expectedResult = torch.Tensor({bias}):expandAs(result)
+ mytester:assertTensorEq(result, expectedResult, precision)
+ end
+
+ --Per column bias
+ module = nn.CAdd(1, 1, 1, width)
+ params, gradParams = module:getParameters()
+
+ testBackwardPass(module, input, params, gradParams)
+
+ input:zero()
+ output = module:forward(input)
+ mytester:assert(output:isSameSizeAs(input))
+
+ for i = 1, module.bias:view(-1):size(1) do
+ local bias = module.bias:view(-1)[i]
+ local result = output[{{}, {}, {}, {i}}]:contiguous():view(-1)
+ local expectedResult = torch.Tensor({bias}):expandAs(result)
+ mytester:assertTensorEq(result, expectedResult, precision)
+ end
+
+ --Per input component bias
+ module = nn.CAdd(batchSize, channels, height, width)
+ params, gradParams = module:getParameters()
+
+ testBackwardPass(module, input, params, gradParams)
+
+ input:zero()
+ output = module:forward(input)
+
+ mytester:assert(output:isSameSizeAs(input))
+ mytester:assert(module.bias:isSameSizeAs(input))
+ mytester:assertTensorEq(module.bias, output, precision)
+
+ testModuleIO(module, input)
+ end
+
+
+ local function testCAddWithLessDimsThanInput()
+ local input = torch.rand(4,5)
+ local module = nn.CAdd(5)
+ local params, gradParams = module:getParameters()
+ testBackwardPass(module, input, params, gradParams)
+
+ input:zero()
+ local output = module:forward(input)
+ local expandedBias = module.bias:view(1,5):expand(4,5):clone()
+ mytester:assert(output:isSameSizeAs(input))
+ mytester:assertTensorEq(expandedBias, output, precision)
+
+ testModuleIO(module, input)
+
+ input = torch.rand(4,5,6)
+ module = nn.CAdd(5,6)
+ params, gradParams = module:getParameters()
+ testBackwardPass(module, input, params, gradParams)
+
+ input:zero()
+      output = module:forward(input)
+ expandedBias = module.bias:view(1,5,6):expand(4,5,6):clone()
+ mytester:assert(output:isSameSizeAs(input))
+ mytester:assertTensorEq(expandedBias, output, precision)
+
+ testModuleIO(module, input)
+ end
+
+
+ testCAddWithNonBatchedInput()
+ testCAddWithBatchedInput()
+ testCAddWithLessDimsThanInput()
+end
+
+function nntest.CMul()
+ local ini = math.random(3,5)
+ local inj = math.random(3,5)
+ local ink = math.random(3,5)
+ local inl = math.random(3,5)
+ local input = torch.Tensor(ini,inj,ink):zero()
+ local module = nn.CMul(1, ini, inj, ink, 1)
+
+ -- 1D
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+ mytester:assertlt(err,precision, 'error on weight ')
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+ mytester:assertlt(err,precision, 'error on weight [direct update] ')
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on weight [%s]', t))
+ end
+
+ -- 2D
+ local nframe = math.random(3,14)
+ local input = torch.randn(nframe, ini,inj,ink)
+ local output = module:forward(input)
+ local output2 = torch.cmul(input, module.weight:view(1,ini,inj,ink):expandAs(input))
+ mytester:assertTensorEq(output2, output, 0.000001, 'CMul forward 2D err')
+
+ module:zeroGradParameters()
+ local gradWeight = module.gradWeight:clone()
+ local gradInput = module:backward(input, output)
+ local gradInput2 = gradInput:clone():zero()
+ local outputView = output:view(input:size(1), -1)
+ gradInput2:view(input:size(1), -1):addcmul(1, module.weight:view(1,-1):expandAs(outputView), outputView)
+ mytester:assertTensorEq(gradInput2, gradInput, 0.000001, 'CMul updateGradInput 2D err')
+ mytester:assert(gradInput:isSameSizeAs(input), 'CMul gradInput 2D size err')
+
+ local inputView = input:view(nframe, -1)
+ local gradWeightView = gradWeight:view(1, -1)
+ for i=1,nframe do
+ gradWeightView:addcmul(1, inputView[i], outputView[i])
+ end
+ mytester:assertTensorEq(gradWeight, module.gradWeight, 0.000001, 'CMul accGradParameters 2D err')
+ mytester:assert(module.weight:isSameSizeAs(module.gradWeight), 'CMul gradWeight size err')
+
+ -- Expansion
+ input = torch.randn(nframe, ini,inj,ink,inl)
+ output = module:forward(input)
+ output2 = torch.cmul(input, module.weight:expandAs(input))
+ mytester:assertTensorEq(output2, output, 0.000001, 'CMul forward expand err')
+
+ module:zeroGradParameters()
+ gradWeight:zero()
+ gradInput = module:backward(input, output)
+ gradInput2 = gradInput:clone():zero()
+ gradInput2:addcmul(1, module.weight:expandAs(output), output)
+ mytester:assertTensorEq(gradInput2, gradInput, 0.000001, 'CMul updateGradInput expansion err')
+ mytester:assert(gradInput:isSameSizeAs(input), 'CMul gradInput expand size err')
+
+ for i=1,nframe do
+ -- 4 is the [non-batch] singleton dim
+ gradWeight:add(torch.cmul(input[i], output[i]):sum(4))
+ end
+ mytester:assertTensorEq(gradWeight:sum(5), module.gradWeight, 0.000001, 'CMul accGradParameters expand err')
+ mytester:assert(module.weight:isSameSizeAs(module.gradWeight), 'CMul accGradParameters expand size err')
+
+ input:zero()
+
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+ mytester:assertlt(err,precision, 'error on weight ')
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+ mytester:assertlt(err,precision, 'error on weight [direct update] ')
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+ mytester:assertlt(err, precision, string.format('error on weight [%s]', t))
+ end
+
+ -- Non-contiguous input or gradOutput
+ local testModule = nn.CMul(4, 3, 5)
+ local testInput = torch.rand(10, 3, 5):resize(10, 1, 3, 5):expand(10, 4, 3, 5)
+ local testOutput = testModule:forward(testInput)
+
+ mytester:assert(testOutput:isSameSizeAs(testInput), 'CMul non-contiguous forward err')
+
+ local testGradOutput = torch.rand(10, 3, 5):resize(10, 1, 3, 5):expand(10, 4, 3, 5)
+ testOutput = testModule:forward(testInput)
+   local testGradInput = testModule:backward(testInput, testGradOutput)
+
+ mytester:assert(testGradInput:isSameSizeAs(testGradOutput), 'CMul non-contiguous backward err')
+
+ -- IO
+ local ferr,berr = jac.testIO(module,input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+end
+
+function nntest.Contiguous()
+ local module = nn.Contiguous()
+
+ -- Contiguous input
+ local input = torch.rand(30,20,10)
+ local output = module:forward(input)
+
+ mytester:assert(output:ne(input):sum() == 0, 'output not equal to input')
+
+ -- Make input non-contiguous
+ local input2 = output:transpose(1,2)
+ local output2 = module:forward(input2)
+
+   mytester:assert(output2:ne(input2):sum() == 0, 'output not equal to non-contiguous input')
+end
+
+function nntest.Dropout()
+   local p = 0.2 -- probability of dropping out a neuron
+ local input = torch.Tensor(1000):fill((1-p))
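+   -- At train time nn.Dropout keeps each unit with probability (1-p) and
+   -- scales it by 1/(1-p), so the mask has mean 1 and a constant input of
+   -- (1-p) keeps an output mean close to (1-p); likewise for gradInput.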
+ local module = nn.Dropout(p)
+ -- version 2
+ local output = module:forward(input)
+ mytester:assert(math.abs(output:mean() - (1-p)) < 0.05, 'dropout output')
+ local gradInput = module:backward(input, input)
+ mytester:assert(math.abs(gradInput:mean() - (1-p)) < 0.05, 'dropout gradInput')
+ -- test inplace version
+ local module = nn.Dropout(p,nil,true)
+ local output = module:forward(input:clone())
+ mytester:assert(math.abs(output:mean() - (1-p)) < 0.05, 'dropout output')
+ local gradInput = module:backward(input:clone(), input:clone())
+ mytester:assert(math.abs(gradInput:mean() - (1-p)) < 0.05, 'dropout gradInput')
+
+ -- version 1 (old nnx version)
+ local input = input:fill(1)
+ local module = nn.Dropout(p,true)
+ local output = module:forward(input)
+ mytester:assert(math.abs(output:mean() - (1-p)) < 0.05, 'dropout output')
+ local gradInput = module:backward(input, input)
+ mytester:assert(math.abs(gradInput:mean() - (1-p)) < 0.05, 'dropout gradInput')
+end
+
+function nntest.SpatialDropout()
+   local p = 0.2 -- probability of dropping out a neuron
+ local w = math.random(1,5)
+ local h = math.random(1,5)
+ local nfeats = 1000
+ local input = torch.Tensor(nfeats, w, h):fill(1)
+ local module = nn.SpatialDropout(p)
+ module.train = true
+ local output = module:forward(input)
+ mytester:assert(math.abs(output:mean() - (1-p)) < 0.05, 'dropout output')
+ local gradInput = module:backward(input, input)
+ mytester:assert(math.abs(gradInput:mean() - (1-p)) < 0.05, 'dropout gradInput')
+end
+
+function nntest.SpatialDropoutBatch()
+   local p = 0.2 -- probability of dropping out a neuron
+ local bsz = math.random(1,5)
+ local w = math.random(1,5)
+ local h = math.random(1,5)
+ local nfeats = 1000
+ local input = torch.Tensor(bsz, nfeats, w, h):fill(1)
+ local module = nn.SpatialDropout(p)
+ module.train = true
+ local output = module:forward(input)
+ mytester:assert(math.abs(output:mean() - (1-p)) < 0.05, 'dropout output')
+ local gradInput = module:backward(input, input)
+ mytester:assert(math.abs(gradInput:mean() - (1-p)) < 0.05, 'dropout gradInput')
+end
+
+function nntest.VolumetricDropout()
+   local p = 0.2 -- probability of dropping out a neuron
+ local t = math.random(1,5)
+ local w = math.random(1,5)
+ local h = math.random(1,5)
+ local nfeats = 1000
+ local input = torch.Tensor(nfeats, t, w, h):fill(1)
+ local module = nn.VolumetricDropout(p)
+ module.train = true
+ local output = module:forward(input)
+ mytester:assert(math.abs(output:mean() - (1-p)) < 0.05, 'dropout output')
+ local gradInput = module:backward(input, input)
+ mytester:assert(math.abs(gradInput:mean() - (1-p)) < 0.05, 'dropout gradInput')
+end
+
+function nntest.VolumetricDropoutBatch()
+   local p = 0.2 -- probability of dropping out a neuron
+ local bsz = math.random(1,5)
+ local t = math.random(1,5)
+ local w = math.random(1,5)
+ local h = math.random(1,5)
+ local nfeats = 1000
+ local input = torch.Tensor(bsz, nfeats, t, w, h):fill(1)
+ local module = nn.VolumetricDropout(p)
+ module.train = true
+ local output = module:forward(input)
+ mytester:assert(math.abs(output:mean() - (1-p)) < 0.05, 'dropout output')
+ local gradInput = module:backward(input, input)
+ mytester:assert(math.abs(gradInput:mean() - (1-p)) < 0.05, 'dropout gradInput')
+end
+
+function nntest.ReLU()
+ local input = torch.randn(3,4)
+ local gradOutput = torch.randn(3,4)
+ local module = nn.ReLU()
+ local output = module:forward(input)
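+   -- reference: gt() yields a 0/1 mask of the positive entries, which
+   -- cmul() then applies to the input (and to gradOutput below)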
+ local output2 = input:clone():gt(input, 0):cmul(input)
+ mytester:assertTensorEq(output, output2, 0.000001, 'ReLU output')
+ local gradInput = module:backward(input, gradOutput)
+ local gradInput2 = input:clone():gt(input, 0):cmul(gradOutput)
+ mytester:assertTensorEq(gradInput, gradInput2, 0.000001, 'ReLU gradInput')
+end
+
+function nntest.ReLU6()
+ for inplace = 0, 1 do
+ local input = torch.randn(3, 4):mul(6)
+ local gradOutput = torch.randn(3,4)
+ local module = nn.ReLU6(inplace == 1)
+ local output = module:forward(input:clone())
+ local gt = input:clone():gt(input, 0)
+ local lt = input:clone():lt(input, 6)
+ local output2 = gt:clone():cmul(lt):cmul(input)
+ output2:add(6, input:clone():gt(input, 6))
+      mytester:assertTensorEq(output, output2, 0.000001, 'ReLU6 output '..(inplace == 1 and '(inplace)' or ''))
+ local gradInput = module:backward(input, gradOutput:clone())
+ local gradInput2 = gt:clone():cmul(lt):cmul(gradOutput)
+      mytester:assertTensorEq(gradInput, gradInput2, 0.000001, 'ReLU6 gradInput '..(inplace == 1 and '(inplace)' or ''))
+ end
+end
+
+function nntest.GatedLinearUnit()
+ local model = nn.GatedLinearUnit()
+ local t = torch.Tensor({{1, 1}, {2, 2}, {3, 3}})
+ local thalf = torch.Tensor():resizeAs(t):copy(t):narrow(2, 1, 1)
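+   -- GLU splits the gated dimension (the last one by default) into halves
+   -- a and b and returns a * sigmoid(b); both halves of t are identical
+   -- here, so the expected output is thalf:cmul(torch.sigmoid(thalf)).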
+ mytester:assertTensorEq(
+ thalf:cmul(torch.sigmoid(thalf)),
+ model:forward(t):resizeAs(thalf),
+ 0.000001,
+ 'Gated Linear output'
+ )
+ t = torch.Tensor({{1, 1, 1, 1}, {2, 2, 2, 2}, {3, 3, 3, 3}})
+ thalf = torch.Tensor():resizeAs(t):copy(t):narrow(2, 1, 2)
+ mytester:assertTensorEq(
+ thalf:cmul(torch.sigmoid(thalf)),
+ model:forward(t),
+ 0.000001,
+ 'Gated Linear Unit output'
+ )
+
+ local input = torch.rand(1, 10)
+ local err = jac.testJacobian(model, input)
+ mytester:assert(err < precision, 'Gated Linear gradient')
+
+ input = torch.rand(5, 10, 6)
+ model = nn.GatedLinearUnit(2)
+ err = jac.testJacobian(model, input)
+ mytester:assert(err < precision, 'Gated Linear gradient, non-default dim')
+
+ input = torch.rand(5, 10, 6)
+ model = nn.GatedLinearUnit(3)
+ err = jac.testJacobian(model, input)
+ mytester:assert(err < precision, 'Gated Linear gradient, non-default dim')
+
+ input = torch.rand(5, 10)
+ model = nn.Sequential()
+ model:add(nn.Linear(10, 10))
+ model:add(nn.GatedLinearUnit())
+ model:add(nn.ReLU())
+ model:add(nn.LogSoftMax())
+ err = jac.testJacobian(model, input)
+ mytester:assert(err < precision, 'Gated Linear gradient with other layers')
+end
+
+function nntest.CReLU()
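+   -- CReLU concatenates ReLU(x) and ReLU(-x) along the feature dimension,
+   -- doubling the number of feature maps.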
+ local function _verifyCReLU(featureMaps, concatenatedFeatureMaps)
+ local rectifiedFeatureMaps = nn.ReLU():forward(featureMaps)
+ local rectifiedNegFeatureMaps = nn.ReLU():forward(-featureMaps)
+
+ mytester:asserteq(concatenatedFeatureMaps:size(1), featureMaps:size(1) * 2,
+ "CReLU should double the number of feature maps")
+
+ for i = 1, rectifiedFeatureMaps:size(1) do
+ local found = false
+ for j = 1, concatenatedFeatureMaps:size(1) do
+ found = found or rectifiedFeatureMaps[i]:equal(concatenatedFeatureMaps[j])
+ end
+ mytester:assert(found, "Original (rectified) feature maps should be in the output of CReLU")
+ end
+
+ for i = 1, rectifiedNegFeatureMaps:size(1) do
+ local found = false
+ for j = 1, concatenatedFeatureMaps:size(1) do
+            found = found or rectifiedNegFeatureMaps[i]:equal(concatenatedFeatureMaps[j])
+ end
+ mytester:assert(found, "The negative of the original (rectified) feature maps should be in the output of CReLU")
+ end
+ end
+
+ local model = nn.Sequential()
+ model:add(nn.SpatialConvolution(1, 3, 3, 3, 1, 1, 1, 1))
+
+ for _, inplace in pairs({true, false}) do
+ --batched
+ local crelu = nn.CReLU(3, inplace)
+ local input = torch.Tensor(2, 1, 20, 20):uniform()
+ local featureMaps = model:forward(input)
+ local concatenatedFeatureMaps = crelu:forward(featureMaps)
+ for i = 1, input:size(1) do
+ _verifyCReLU(featureMaps[i], concatenatedFeatureMaps[i])
+ end
+
+ --non-batched
+ local input = torch.Tensor(1, 20, 20):uniform()
+ local featureMaps = model:forward(input)
+ local concatenatedFeatureMaps = crelu:forward(featureMaps)
+ _verifyCReLU(featureMaps, concatenatedFeatureMaps)
+ end
+
+ --test gradients w.r.t input
+ local jac = nn.Jacobian
+
+ for _, inplace in pairs({true, false}) do
+ local crelu = nn.CReLU(3, inplace)
+ --batched
+ local input = torch.Tensor(2, 3, 20, 20):uniform()
+ local err = jac.testJacobian(crelu, input)
+ mytester:assertlt(err, precision, "error computing gradients w.r.t. inputs")
+
+ --I/O
+ local fwdErr,bkwdErr = jac.testIO(crelu,input)
+ mytester:asserteq(fwdErr, 0, torch.typename(crelu) .. " - i/o forward err ")
+ mytester:asserteq(bkwdErr, 0, torch.typename(crelu) .. " - i/o backward err ")
+
+ --non-batched
+ input = torch.Tensor(3, 20, 20):uniform()
+ err = jac.testJacobian(crelu,input)
+ mytester:assertlt(err, precision, "error computing gradients w.r.t. inputs")
+
+ --I/O
+ local fwdErr,bkwdErr = jac.testIO(crelu,input)
+ mytester:asserteq(fwdErr, 0, torch.typename(crelu) .. " - i/o forward err ")
+ mytester:asserteq(bkwdErr, 0, torch.typename(crelu) .. " - i/o backward err ")
+ end
+
+end
+
+function nntest.Exp()
+ local ini = math.random(3,5)
+ local inj = math.random(3,5)
+ local ink = math.random(3,5)
+ local input = torch.Tensor(ini,inj,ink):zero()
+ local module = nn.Exp()
+
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ local ferr,berr = jac.testIO(module,input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+end
+
+function nntest.Log()
+ local ini = math.random(3,5)
+ local inj = math.random(3,5)
+ local ink = math.random(3,5)
+ local input = torch.Tensor(ini,inj,ink):zero()
+ local module = nn.Log()
+
+ local err = jac.testJacobian(module,input, 0.1, 10)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ local ferr,berr = jac.testIO(module,input, 0.1, 10)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+end
+
+function nntest.HardTanh()
+ local ini = math.random(3,5)
+ local inj = math.random(3,5)
+ local ink = math.random(3,5)
+ local input = torch.Tensor(ink, inj, ini):zero()
+
+ local module = nn.HardTanh()
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision , 'error on state ')
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+
+   -- test inclusive bounds: HardTanh(1, inf) should behave like Threshold(1)
+ local input = torch.Tensor({1})
+ local gradOutput = torch.Tensor({1})
+ local gradOutputClone = gradOutput:clone()
+ local module = nn.HardTanh(1, math.huge, true)
+ local tanhGradInput = module:backward(input, gradOutput)
+
+ local input = input:clone()
+ local gradOutput = gradOutputClone
+ local module = nn.Threshold(1, 0, true)
+ local threshGradInput = module:backward(input, gradOutput)
+ mytester:assertTensorEq(tanhGradInput, threshGradInput, 0.000001, 'HardTanh gradInput')
+end
+
+function nntest.Clamp()
+ local ini = math.random(3,5)
+ local inj = math.random(3,5)
+ local ink = math.random(3,5)
+ local max_value = math.abs(math.random())
+ local min_value = -math.abs(math.random())
+ local input = torch.Tensor(ink, inj, ini):zero()
+
+ local module = nn.Clamp(min_value, max_value)
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision , 'error on state ')
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+end
+
+function nntest.Abs()
+ local ini = math.random(3,5)
+ local inj = math.random(3,5)
+ local ink = math.random(3,5)
+ local input = torch.Tensor(ink, inj, ini):zero()
+
+ local module = nn.Abs()
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision , 'error on state ')
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+end
+
+function nntest.Threshold()
+ local ini = math.random(3,5)
+ local inj = math.random(3,5)
+ local ink = math.random(3,5)
+ local input = torch.Tensor(ink, inj, ini):zero()
+
+ local module = nn.Threshold(torch.uniform(-2,2),torch.uniform(-2,2))
+
+ local err = nn.Jacobian.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error on state ')
+
+ local ferr, berr = nn.Jacobian.testIO(module, input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+end
+
+function nntest.ELU()
+ local ini = math.random(3,5)
+ local inj = math.random(3,5)
+ local ink = math.random(3,5)
+ local input = torch.Tensor(ink, inj, ini):zero()
+
+ local module = nn.ELU(0.3)
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision , 'error on state ')
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+end
+
+function nntest.ELUIP()
+ local input = torch.randn(3,4)
+ local input2 = input:clone()
+ local gradOutput = torch.randn(3,4)
+ local gradOutput2 = gradOutput:clone()
+
+ -- Compare in-place to not in-place
+ local module = nn.ELU(0.3, true)
+ local module2 = nn.ELU(0.3, false)
+
+ local output = module:forward(input)
+ local output2 = module2:forward(input2)
+ mytester:assertTensorEq(output, output2, 0.000001, 'ELU output')
+ local gradInput = module:backward(input, gradOutput)
+ local gradInput2 = module2:backward(input2, gradOutput2)
+ mytester:assertTensorEq(gradInput, gradInput2, 0.000001, 'ELU gradInput')
+end
+
+function nntest.PReLU()
+ local ini = math.random(3,5)
+ local input = torch.Tensor(ini):zero()
+
+ local module = nn.PReLU(ini)
+
+ -- 1D
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+ mytester:assertlt(err,precision, 'error on weight ')
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+ mytester:assertlt(err,precision, 'error on weight [direct update] ')
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on weight [%s]', t))
+ end
+
+ -- 2D
+ local nframe = math.random(1,7)
+ local input = torch.Tensor(nframe, ini):zero()
+
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+ mytester:assertlt(err,precision, 'error on weight ')
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+ mytester:assertlt(err,precision, 'error on weight [direct update] ')
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on weight [%s]', t))
+ end
+
+ -- 4D
+ local nframe = math.random(1,7)
+ local kW, kH = math.random(1,8), math.random(1,8)
+ local input = torch.Tensor(nframe, ini, kW, kH):zero()
+
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+ mytester:assertlt(err,precision, 'error on weight ')
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+ mytester:assertlt(err,precision, 'error on weight [direct update] ')
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on weight [%s]', t))
+ end
+
+ -- IO
+ local ferr,berr = jac.testIO(module,input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+end
+
+function nntest.RReLU()
+ local nframe = math.random(1,7)
+ local size = math.random(1,7)
+ local kW, kH = math.random(1,8), math.random(1,8)
+ local input = torch.Tensor(nframe, size, kW, kH):zero()
+
+ local l = 1/math.random(5,8)
+ local u = 1/math.random(3,5)
+
+ -- test in evaluation mode (not inplace), RReLU behaves like LeakyReLU
+ local module = nn.RReLU(l, u, false)
+ mytester:assert(module.train, 'default mode ')
+ module:evaluate()
+
+ -- gradient check
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error on state ')
+
+ -- IO
+ local ferr,berr = jac.testIO(module, input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+
+   -- test training and evaluation modes
+ for _,train in ipairs({true,false}) do
+ -- test with separate output buffer and inplace
+ for _,inplace in ipairs({false,true}) do
+ module = nn.RReLU(l, u, inplace)
+ if train then
+ module:training()
+ else
+ module:evaluate()
+ end
+ input = torch.rand(nframe, size, kW, kH) - 0.5
+ input:storage()[1] = -1
+ local original_input = input:clone()
+ local output = module:forward(input)
+ mytester:assert(output:sign():eq(original_input:sign()):all(), 'sign flipped forward ')
+ local gradOutput = torch.ones(output:size())
+ local gradInput = module:backward(input, gradOutput)
+ mytester:assert(gradInput:gt(0):eq(input:ne(0)):all(), 'gradient ')
+ mytester:assert(gradInput:lt(1):eq(input:le(0)):all(), 'backward negative inputs ')
+ mytester:assert(gradInput:eq(1):eq(input:gt(0)):all(), 'backward positive inputs ')
+ if not train then
+            local err = math.abs(gradInput[input:le(0)]:mean() - (module.lower + module.upper)/2)
+ mytester:assertlt(err, precision, 'error on gradient ')
+ end
+
+ input = -torch.rand(1000)
+ module:forward(input) -- fill internal noise tensor
+ local g = module:backward(input, torch.ones(1000))
+ local err = math.abs(g[input:le(0)]:mean()-(module.lower+module.upper)/2)
+ mytester:assertlt(err, 0.05, 'mean deviation of gradient for negative inputs ')
+ end
+ end
+end
+
+function nntest.LeakyReLU()
+ local input = torch.randn(3,4)
+ local gradOutput = torch.randn(3,4)
+ local negval = math.random()
+ local module = nn.LeakyReLU(negval)
+ local output = module:forward(input)
+ local output2 = input:clone():gt(input, 0):cmul(input) + input:clone():le(input,0):cmul(input) * module.negval
+ mytester:assertTensorEq(output, output2, 0.000001, 'LeakyReLU output')
+ local gradInput = module:backward(input, gradOutput)
+ local gradInput2 = input:clone():gt(input, 0):cmul(gradOutput) + input:clone():le(input,0):cmul(gradOutput) * module.negval
+ mytester:assertTensorEq(gradInput, gradInput2, 0.000001, 'LeakyReLU gradInput')
+end
+
+function nntest.LeakyReLUIP()
+ local input = torch.randn(3,4)
+ local gradOutput = torch.randn(3,4)
+ local negval = math.random()
+ local module = nn.LeakyReLU(negval,true)
+ local output = input:clone():gt(input, 0):cmul(input) + input:clone():le(input,0):cmul(input) * module.negval
+ local output2 = module:forward(input)
+ mytester:assertTensorEq(output2, output, 0.000001, 'LeakyReLU output')
+ local gradInput = input:clone():gt(input, 0):cmul(gradOutput) + input:clone():le(input,0):cmul(gradOutput) * module.negval
+ local gradInput2 = module:backward(input, gradOutput)
+ mytester:assertTensorEq(gradInput2, gradInput, 0.000001, 'LeakyReLU gradInput')
+end
+
+function nntest.HardShrink()
+ local ini = math.random(3,5)
+ local inj = math.random(3,5)
+ local ink = math.random(3,5)
+ local input = torch.Tensor(ink, inj, ini):zero()
+
+ local module = nn.HardShrink(math.random()/2)
+
+ local err = nn.Jacobian.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error on state ')
+
+ local ferr, berr = nn.Jacobian.testIO(module, input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+end
+
+function nntest.SoftShrink()
+ local ini = math.random(3,5)
+ local inj = math.random(3,5)
+ local ink = math.random(3,5)
+ local input = torch.Tensor(ink, inj, ini):zero()
+
+ local module = nn.SoftShrink(math.random()/2)
+
+ local err = nn.Jacobian.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error on state ')
+
+ local ferr, berr = nn.Jacobian.testIO(module, input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+end
+
+function nntest.Power()
+ local in1 = torch.rand(5,7)
+ local module = nn.Power(2)
+ local out = module:forward(in1)
+ local err = out:dist(in1:cmul(in1))
+ mytester:assertlt(err, 1e-15, torch.typename(module) .. ' - forward err ')
+
+ local ini = math.random(3,5)
+ local inj = math.random(3,5)
+ local ink = math.random(3,5)
+ local pw = torch.uniform()*math.random(1,10)
+ local input = torch.Tensor(ink, inj, ini):zero()
+
+ local module = nn.Power(pw)
+
+ local err = nn.Jacobian.testJacobian(module, input, 0.1, 2)
+ mytester:assertlt(err, precision, 'error on state ')
+
+ local ferr, berr = nn.Jacobian.testIO(module,input, 0.1, 2)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+end
+
+function nntest.Normalize()
+ -- compare forward against torch implementation
+ -- and check gradient
+ for _,p in pairs({1,2,3,4,1.5}) do
+ local ini = math.random(3,10)
+ local input = torch.randn(ini)
+ local module = nn.Normalize(p)
+ local out = module:forward(input)
+ local expected = torch.div(input,input:norm(p))
+ mytester:assertTensorEq(out, expected, 1e-7,
+ torch.typename(module) ..' (' .. p ..') - forward err ')
+
+ local err = jac.testJacobian(module, input, -2, 2)
+ mytester:assertlt(err, precision, 'error norm '..p..' on state ')
+ end
+
+ -- batch mode
+ for _,p in pairs({1,2,3,4,torch.uniform()*math.random(1,10),math.huge}) do
+ local ini = math.random(3,5)
+ local inj = math.random(3,5)
+ local ink = math.random(3,5)
+ local input = torch.Tensor(inj, ini):zero()
+
+ local module = nn.Normalize(p)
+
+ local err = jac.testJacobian(module, input, -2, 2)
+ mytester:assertlt(err, precision, 'error norm '..p..' on state ')
+ end
+
+ -- test IO correctness
+ local ini = math.random(3,5)
+ local inj = math.random(3,5)
+ local ink = math.random(3,5)
+ local input = torch.Tensor(inj, ini):zero()
+
+ local module = nn.Normalize(2)
+
+ local ferr, berr = jac.testIO(module,input, 0.1, 2)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+
+end
+
+function nntest.Square()
+ local in1 = torch.rand(5,7)
+ local module = nn.Square()
+ local out = module:forward(in1)
+ local err = out:dist(in1:cmul(in1))
+ mytester:assertlt(err, 1e-15, torch.typename(module) .. ' - forward err ')
+
+ local ini = math.random(3,5)
+ local inj = math.random(3,5)
+ local ink = math.random(3,5)
+ local input = torch.Tensor(ink, inj, ini):zero()
+
+ local module = nn.Square()
+
+ local err = nn.Jacobian.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error on state ')
+
+ local ferr, berr = nn.Jacobian.testIO(module, input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+end
+
+function nntest.Sqrt()
+ local in1 = torch.rand(5,7)
+ local module = nn.Sqrt()
+ local out = module:forward(in1)
+ local err = out:dist(in1:sqrt())
+ mytester:assertlt(err, 1e-15, torch.typename(module) .. ' - forward err ')
+
+ -- Test zero inputs: the backward pass must avoid the div-by-zero singularity by returning zero gradients
+ local zin = torch.DoubleTensor(5, 7):zero()
+ module:forward(zin)
+ local zgradout = torch.rand(5, 7)
+ local zgradin = module:backward(zin, zgradout)
+ mytester:assertTensorEq(zgradin, torch.DoubleTensor(5, 7):zero(), 0.000001, "error in sqrt backward singularity")
+
+ local ini = math.random(3,5)
+ local inj = math.random(3,5)
+ local ink = math.random(3,5)
+ local input = torch.Tensor(ink, inj, ini):zero()
+
+ local module = nn.Sqrt()
+
+ local err = nn.Jacobian.testJacobian(module, input, 0.1, 2)
+ mytester:assertlt(err, precision, 'error on state ')
+
+ local ferr, berr = nn.Jacobian.testIO(module, input, 0, 2)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+end
+
+function nntest.Linear()
+ local ini = math.random(3,5)
+ local inj_vals = {math.random(3,5), 1} -- Also test the inj = 1 spatial case
+ local input = torch.Tensor(ini):zero()
+
+ for ind, inj in pairs(inj_vals) do
+ local module = nn.Linear(ini,inj)
+
+ local function jacTests(module)
+ -- 1D
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+ mytester:assertlt(err,precision, 'error on weight ')
+
+ if module.bias then
+ local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+ mytester:assertlt(err,precision, 'error on bias ')
+ end
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+ mytester:assertlt(err,precision, 'error on weight [direct update] ')
+
+ if module.bias then
+ local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+ mytester:assertlt(err,precision, 'error on bias [direct update] ')
+ end
+
+ nn.hessian.enable()
+
+ local err = jac.testDiagHessianInput(module, input)
+ mytester:assertlt(err , precision, 'error on diagHessianInput')
+
+ local err = jac.testDiagHessianWeight(module, input)
+ mytester:assertlt(err , precision, 'error on diagHessianWeight')
+
+ if module.bias then
+ local err = jac.testDiagHessianBias(module, input)
+ mytester:assertlt(err , precision, 'error on diagHessianBias')
+ end
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on weight [%s]', t))
+ end
+
+ if module.bias then
+ for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on bias [%s]', t))
+ end
+ end
+
+ -- 2D
+ local nframe = math.random(50,70)
+ local input = torch.Tensor(nframe, ini):zero()
+
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+ mytester:assertlt(err,precision, 'error on weight ')
+
+ if module.bias then
+ local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+ mytester:assertlt(err,precision, 'error on bias ')
+ end
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+ mytester:assertlt(err,precision, 'error on weight [direct update] ')
+
+ if module.bias then
+ local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+ mytester:assertlt(err,precision, 'error on bias [direct update] ')
+ end
+
+ local err = jac.testDiagHessianInput(module, input)
+ mytester:assertlt(err , precision, 'error on diagHessianInput')
+
+ local err = jac.testDiagHessianWeight(module, input)
+ mytester:assertlt(err , precision, 'error on diagHessianWeight')
+
+ if module.bias then
+ local err = jac.testDiagHessianBias(module, input)
+ mytester:assertlt(err , precision, 'error on diagHessianBias')
+ end
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on weight [%s]', t))
+ end
+
+ if module.bias then
+ for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on bias [%s]', t))
+ end
+ end
+
+ -- IO
+ local ferr,berr = jac.testIO(module,input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+ end
+
+ jacTests(module)
+ module:noBias()
+ jacTests(module)
+ module.bias = torch.Tensor(inj):zero()
+ module.gradBias = torch.Tensor(inj):zero()
+ module:reset()
+ jacTests(module)
+ end -- for ind, inj in pairs(inj_vals) do
+end
+
+local function test_sparse_linear(inb, ini, inj, numNonzero)
+ local module = nn.SparseLinear(ini,inj, true)
+ local linear = nn.Linear(ini, inj)
+ linear.weight = module.weight:clone()
+ linear.bias = module.bias:clone()
+ module:zeroGradParameters()
+ linear:zeroGradParameters()
+
+ -- Create a random sparse vector
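+ -- each sample is an nnz x 2 tensor: column 1 holds indices, column 2 holds values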
+ local input = {}
+ local nonsparse = torch.zeros(inb, ini)
+ for i=1,inb do
+ local nnz = math.random(1, 3) + numNonzero
+ local inds = torch.randperm(ini)[{{1,nnz}}]
+ input[i] = torch.Tensor(nnz, 2)
+ input[i]:select(2,1):copy(inds)
+ input[i]:select(2,2):copy(torch.rand(nnz))
+ nonsparse[i]:scatter(1, input[i]:select(2,1):long(), input[i]:select(2,2))
+ end
+ local gradOutput = torch.rand(inb, inj)
+
+ local cmps = {'weight', 'bias', 'gradWeight', 'gradBias'}
+
+ -- Check output wrt linear, non-batch
+ local actual = module:forward(input[1])
+ local expected = linear:forward(nonsparse[1])
+ local actualgi = module:backward(input[1], gradOutput[1])
+ local expectedgi = linear:backward(nonsparse[1], gradOutput[1])
+ module:updateParameters(1)
+ linear:updateParameters(1)
+ local err = (expected - actual):abs():max()
+ local gierr = (expectedgi - actualgi[1]:select(2,2)):abs():max()
+ mytester:assertle(err, precision, 'error on result')
+ mytester:assertle(gierr, precision, 'error on gradInput')
+
+ for _,var in ipairs(cmps) do
+ local err = (module[var] - linear[var]):abs():max()
+ mytester:assertle(err, precision, 'error on '..var)
+ end
+ module:zeroGradParameters()
+ linear:zeroGradParameters()
+
+ -- Check output wrt linear, batch
+ -- running this n times exercises the fast-path parameter updates for repeated inputs
+ local test_n_times = function(ntimes)
+ local actual, expected, actualgi, expectedgi
+ for i=1, ntimes do
+ actual = module:forward(input)
+ expected = linear:forward(nonsparse)
+ actualgi = module:backward(input, gradOutput)
+ expectedgi = linear:backward(nonsparse, gradOutput)
+ end
+ module:updateParameters(1)
+ linear:updateParameters(1)
+ local err = (expected - actual):abs():max()
+ local gicheck = torch.Tensor():resizeAs(expectedgi)
+ for i=1,#actualgi do gicheck[i]:copy(actualgi[i]:select(2,2)) end
+ local gierr = (expectedgi - gicheck):abs():max()
+ mytester:assertle(err, precision, 'error on result with ntimes = '..ntimes)
+ mytester:assertle(gierr, precision, 'error on gradInput with ntimes = '..ntimes)
+
+ for _,var in ipairs(cmps) do
+ local err = (module[var] - linear[var]):abs():max()
+ mytester:assertle(err, precision, 'error on '..var..' with ntimes = '..ntimes)
+ end
+
+ module:zeroGradParameters()
+ linear:zeroGradParameters()
+ mytester:assertle(module.gradWeight:sum(), precision, 'error zeroing gradweight')
+ mytester:assertle(module.gradBias:sum(), precision, 'error zeroing gradbias')
+
+ end
+
+ test_n_times(1)
+ test_n_times(2)
+ test_n_times(3)
+
+ -- legacy batch mode
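+ -- legacy format: a dense batch x numNonzero x 2 tensor of (index, value) pairs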
+ local batch = math.random(2,5)
+
+ local input = torch.Tensor(batch, numNonzero, 2):zero()
+ for k=1,batch do
+ local N = {}
+ for i = 1, ini do N[i] = i end
+ for i = 1, numNonzero do
+ local j = math.random(i,ini)
+ N[i], N[j] = N[j], N[i]
+ end
+ for i = 1, numNonzero do input[{k,i,1}] = N[i] end
+ end
+ local values = input:select(3,2)
+ values:copy(torch.rand(values:nElement())):mul(2):add(-1)
+
+ -- Check output
+ local actual = module:forward(input):clone()
+ local expected = torch.Tensor(batch, inj)
+ for k = 1, batch do
+ expected[k]:copy(module:forward(input[k]))
+ end
+ local err = (expected - actual):abs():max()
+ mytester:assertle(err, precision, 'error on batch result forward')
+end
+
+function nntest.SparseLinear()
+ local inb = math.random(5,10)
+ local ini = math.random(50,100)
+ local inj = math.random(5,10)
+ local numNonzero = math.random(3,5)
+
+ test_sparse_linear(inb, ini, inj, numNonzero)
+ -- Tests OMP parallelism
+ test_sparse_linear(1, 50000, 10, 20000)
+ test_sparse_linear(1000, 1000, 10, 100)
+end
+
+local function testIndexLinear(bsize, iSize, oSize, nnz)
+ local inb = bsize
+ local ini = iSize
+ local inj = oSize
+
+ local ilinear = nn.IndexLinear(ini,inj, true, nil, nil, nil, false)
+ local ilinear2 = nn.IndexLinear(ini,inj, true, nil, nil, nil, false)
+ local linear = nn.Linear(ini, inj)
+ ilinear.weight:zero()
+ ilinear.weight:copy(linear.weight:t():clone())
+ ilinear.bias = linear.bias:clone()
+ ilinear:zeroGradParameters()
+
+ ilinear2.weight:zero()
+ ilinear2.weight:copy(linear.weight:t():clone())
+ ilinear2.bias = linear.bias:clone()
+ ilinear2:zeroGradParameters()
+
+ linear:zeroGradParameters()
+
+ -- Create a random sparse vector
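+ -- input is a table {indices, values} per sample; flatInput is the batched form
+ -- {flatIndices, flatValues, sizes}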
+ local input = {{},{}}
+ local flatInput = {torch.LongTensor(), torch.Tensor(), torch.LongTensor()}
+ local nonsparse = torch.zeros(inb, ini)
+ local sizes = flatInput[3]
+ sizes:resize(inb)
+ for i=1,inb do
+ sizes[i] = nnz
+ input[1][i] = torch.randperm(ini)[{{1,nnz}}]:long()
+ input[2][i] = torch.ones(nnz):uniform()
+ nonsparse[i]:scatter(1, input[1][i], input[2][i])
+ end
+ flatInput[1]:cat(input[1])
+ flatInput[2]:cat(input[2])
+
+ local gradOutput = torch.rand(inb, inj)
+ local cmps = {'weight', 'bias', 'gradBias'}
+ -- Check output wrt linear, non-batch
+ local actual = ilinear:forward({input[1][1], input[2][1]})
+ local actual2 = ilinear2:forward({input[1][1], input[2][1], flatInput[3][1]})
+ local expected = linear:forward(nonsparse[1])
+
+ local actualgi = ilinear:backward({input[1][1], input[2][1]}, gradOutput[1])
+ local actualgi2 = ilinear2:backward({input[1][1], input[2][1], flatInput[3][1]}, gradOutput[1])
+ local expectedgi = linear:backward(nonsparse[1], gradOutput[1])
+
+ ilinear:updateParameters(1)
+ ilinear2:updateParameters(1)
+ linear:updateParameters(1)
+
+ local err = (expected - actual):abs():max()
+ local err2 = (expected - actual2):abs():max()
+
+ local gierr = (expectedgi - actualgi[2]):abs():max()
+ local gierr2 = (expectedgi - actualgi2[2]):abs():max()
+
+ mytester:assertle(err, precision, 'error on result for tensor array')
+ mytester:assertle(gierr, precision, 'error on gradInput for tensor array')
+
+ mytester:assertle(err2, precision, 'error on result for batched tensor')
+ mytester:assertle(gierr2, precision, 'error on gradInput for batched tensor')
+
+ for _,var in ipairs(cmps) do
+ local err, err2
+ if var == 'weight' then
+ err = (ilinear[var]:t() - linear[var]):abs():max()
+ err2 = (ilinear2[var]:t() - linear[var]):abs():max()
+ else
+ err = (ilinear[var] - linear[var]):abs():max()
+ err2 = (ilinear2[var] - linear[var]):abs():max()
+ end
+ mytester:assertle(err, precision, 'error on '..var..' for tensor array')
+ mytester:assertle(err2, precision, 'error on '..var..' for batched tensor')
+ end
+ ilinear:zeroGradParameters()
+ ilinear2:zeroGradParameters()
+ linear:zeroGradParameters()
+
+ -- Check output wrt linear, batch
+ -- running this n times exercises the fast-path parameter updates for repeated inputs
+ local test_n_times = function(ntimes)
+ local actual, expected, actualgi, expectedgi
+ for i=1, ntimes do
+ actual = ilinear:forward(input)
+ actual2 = ilinear2:forward(flatInput)
+ expected = linear:forward(nonsparse)
+
+ actualgi = ilinear:backward(input, gradOutput)
+ actualgi2 = ilinear2:backward(flatInput, gradOutput)
+ expectedgi = linear:backward(nonsparse, gradOutput)
+ end
+ ilinear:updateParameters(1)
+ ilinear2:updateParameters(1)
+ linear:updateParameters(1)
+
+ local err = (expected - actual):abs():max()
+ local err2 = (expected - actual2):abs():max()
+
+ local gicheck = torch.Tensor():resizeAs(expectedgi)
+ local gicheck2 = actualgi2[2]
+
+ for i=1,#actualgi[2] do
+ gicheck[i]:copy(actualgi[2][i])
+ end
+ local gierr = (expectedgi - gicheck):abs():max()
+ local gierr2 = (expectedgi - gicheck2):abs():max()
+
+ mytester:assertle(err, precision, 'error on result for tensor array with ntimes = '..ntimes)
+ mytester:assertle(err2, precision, 'error on result for batched tensor with ntimes = '..ntimes)
+
+ mytester:assertle(gierr, precision, 'error on gradInput for tensor array with ntimes = '..ntimes)
+ mytester:assertle(gierr2, precision, 'error on gradInput for batched tensor with ntimes = '..ntimes)
+
+ for _,var in ipairs(cmps) do
+ local err, err2
+ if var == 'weight' then
+ err = (ilinear[var]:t() - linear[var]):abs():max()
+ err2 = (ilinear2[var]:t() - linear[var]):abs():max()
+ else
+ err = (ilinear[var] - linear[var]):abs():max()
+ err2 = (ilinear2[var] - linear[var]):abs():max()
+ end
+ mytester:assertle(err, precision, 'error on '..var..' for tensor array')
+ mytester:assertle(err2, precision, 'error on '..var..' for batched tensor')
+ end
+
+ ilinear:zeroGradParameters()
+ ilinear2:zeroGradParameters()
+ linear:zeroGradParameters()
+ mytester:assertle(ilinear.gradBias:sum(), precision, 'error zeroing gradbias for tensor array')
+ mytester:assertle(ilinear2.gradBias:sum(), precision, 'error zeroing gradbias for batched tensor')
+ end
+ test_n_times(1)
+ test_n_times(2)
+ test_n_times(3)
+end
+
+function nntest.IndexLinear()
+ testIndexLinear(4, 40 , 10, 30)
+ testIndexLinear(4, 40 , 500, 30)
+ testIndexLinear(4, 200000 , 5, 150000)
+
+ local sizes = {
+ {osize = 1, isize = 10000, nnz = 10000, bsize = 16},
+ {osize = 10, isize = 10000, nnz = 10000, bsize = 16},
+ {osize = 100, isize = 10000, nnz = 10000, bsize = 16},
+
+ {osize = 1, isize = 10000, nnz = 200000, bsize = 1},
+ {osize = 10, isize = 10000, nnz = 200000, bsize = 1},
+ {osize = 100, isize = 10000, nnz = 200000, bsize = 1},
+
+ {osize = 1, isize = 10000, nnz = 200000, bsize = 2},
+ {osize = 10, isize = 10000, nnz = 200000, bsize = 2},
+ {osize = 100, isize = 10000, nnz = 200000, bsize = 2},
+ }
+
+ for i, lsizes in ipairs(sizes) do
+ -- Test multithreaded updates
+ local isize = lsizes.isize
+ local osize = lsizes.osize
+ local il = nn.IndexLinear(isize, osize)
+ local batch = {{},{}}
+ local idx = 100
+ local nnz = lsizes.nnz
+ local bsize = lsizes.bsize
+ for i=1,bsize do
+ batch[1][i] = torch.LongTensor(nnz):fill(idx)
+ batch[2][i] = torch.DoubleTensor(nnz):fill(1)
+ end
+ local totalSize = bsize*nnz
+ local lr = 0.01
+ -- Update the same index all over
+ local out = il:updateOutput(batch)
+ out:fill(1)
+ il:backwardUpdate(batch, out, lr)
+ il:backward(batch, out, 1)
+ il:updateParameters(lr)
+ for i=1,osize do
+ mytester:assertlt(math.abs(il.weight[idx][i] + totalSize * lr * 2), precision, 'parameters update was wrong.')
+ end
+ end
+end
+
+function nntest.Bilinear()
+
+ -- set up data:
+ local N = 10
+ local D1 = 5
+ local D2 = 4
+ local K = 3
+ local input = {torch.randn(N, D1), torch.randn(N, D2)}
+ local target = torch.randn(N, K)
+
+ -- test forward
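+ -- reference: output[n][k] = input1[n] * weight[k] * input2[n]^T + bias[k], built per output slice k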
+ local module = nn.Bilinear(D1, D2, K)
+ local expected = torch.zeros(N,K)
+ for k = 1, K do
+ local temp = torch.mm(module.weight[k], input[2]:t())
+ temp:cmul(input[1]:t())
+ temp = temp:sum(1)
+ temp:add(module.bias[k])
+ expected[{{},k}] = temp:view(-1)
+ end
+ local output = module:forward(input)
+ mytester:assertTensorEq(expected, output, 0.000001, 'Bilinear forward 2D err')
+
+ -- For testing grads we'll follow the nn.DotProduct strategy of using a SplitTable
+ local input2 = torch.randn(2, N, D1)
+ local module2 = nn.Sequential()
+ module2:add(nn.SplitTable(1))
+ module2:add(nn.ParallelTable():add(nn.Linear(D1,D1)):add(nn.Linear(D1,D2)))
+ module2:add(nn.Bilinear(D1, D2, K))
+ module2:add(nn.Linear(K,1))
+
+ local err = jac.testJacobian(module2, input2)
+ mytester:assertlt(err, precision, 'error on state ')
+
+ local err = jac.testJacobianParameters(module2, input2, module2:get(3).weight, module2:get(3).gradWeight)
+ mytester:assertlt(err, precision, 'error on weight ')
+
+ local err = jac.testJacobianParameters(module2, input2, module2:get(3).bias, module2:get(3).gradBias)
+ mytester:assertlt(err, precision, 'error on bias ')
+
+end
+
+function nntest.PartialLinear()
+
+ -- settings for experiment:
+ local N = 10
+ local D = 5
+ local K = 15
+
+ -- test forward-backward pass of module:
+ local module = nn.PartialLinear(D, K)
+ for sub_K = 1,K do
+
+ -- get random test case:
+ local input = torch.randn(N, D)
+ local partition = torch.randperm(K):narrow(1, 1, sub_K)
+
+ -- do forward-backward pass:
+ module:setPartition(partition)
+ module:forward(input)
+ mytester:asserteq(module.output:size(1), N)
+ mytester:asserteq(module.output:size(2), sub_K)
+ module:backward(input, torch.ones(N, sub_K))
+ mytester:asserteq(module.gradInput:size(1), input:size(1))
+ mytester:asserteq(module.gradInput:size(2), input:size(2))
+
+ -- do parameter update:
+ local lr = .01
+ module:updateParameters(lr)
+ end
+ module:resetPartition()
+
+ -- compare output with linear layer:
+ local module2 = nn.Linear(D, K)
+ module2.weight:copy(module.network:get(1):get(2).weight)
+ module2.bias:fill(0)
+ if module.bias then module2.bias:copy(module.bias) end
+ local input = torch.randn(N, D)
+ local diff = (module:forward(input) - module2:forward(input)):abs():sum()
+ mytester:assertlt(diff, 1e-7)
+
+ -- gradient checks:
+ local sub_K = 5
+ local partition = torch.randperm(K):narrow(1, 1, sub_K)
+ module:setPartition(partition)
+ local err = sjac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error on state ')
+
+ local err = sjac.testJacobianParameters(module, input, module.network:get(1):get(2).weight, module.network:get(1):get(2).gradWeight)
+ mytester:assertlt(err,precision, 'error on weight ')
+
+ local err = sjac.testJacobianParameters(module, input, module.bias, module.gradBias)
+ mytester:assertlt(err,precision, 'error on bias ')
+
+ local err = sjac.testJacobianUpdateParameters(module, input, module.network:get(1):get(2).weight)
+ mytester:assertlt(err,precision, 'error on weight [direct update] ')
+
+ local err = sjac.testJacobianUpdateParameters(module, input, module.bias)
+ mytester:assertlt(err,precision, 'error on bias [direct update] ')
+
+ local ferr, berr = sjac.testIO(module, input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+end
+
+function nntest.Euclidean()
+ local ini = math.random(5,7)
+ local inj = math.random(5,7)
+ local input = torch.randn(ini)
+ local gradOutput = torch.randn(inj)
+ local module = nn.Euclidean(ini,inj)
+ local output = module:forward(input):clone()
+
+ local output2 = torch.Tensor(inj):zero()
+ for o = 1,module.weight:size(2) do
+ output2[o] = input:dist(module.weight:select(2,o))
+ end
+ mytester:assertTensorEq(output, output2, 0.000001, 'Euclidean forward 1D err')
+
+ local input2 = torch.randn(8, ini)
+ input2[2]:copy(input)
+ local output2 = module:forward(input2)
+ mytester:assertTensorEq(output2[2], output, 0.000001, 'Euclidean forward 2D err')
+
+ local output = module:forward(input):clone()
+ module:zeroGradParameters()
+ local gradInput = module:backward(input, gradOutput, 1):clone()
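+ -- reference gradInput: sum over columns o of gradOutput[o] * (input - weight[:,o]) / output[o]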
+ local gradInput2 = torch.zeros(ini)
+ local temp = input:clone()
+ for o = 1,module.weight:size(2) do
+ temp:copy(input)
+ temp:add(-1,module.weight:select(2,o))
+ temp:mul(gradOutput[o]/output[o])
+ gradInput2:add(temp)
+ end
+ mytester:assertTensorEq(gradInput, gradInput2, 0.000001, 'Euclidean updateGradInput 1D err')
+
+ local gradWeight = module.gradWeight:clone():zero()
+ for o = 1,module.weight:size(2) do
+ temp:copy(module.weight:select(2,o)):add(-1,input)
+ temp:mul(gradOutput[o]/output[o])
+ gradWeight:select(2,o):add(1, temp)
+ end
+ mytester:assertTensorEq(gradWeight, module.gradWeight, 0.000001, 'Euclidean accGradParameters 1D err')
+
+ local input2 = input:view(1, -1):repeatTensor(8, 1)
+ local gradOutput2 = gradOutput:view(1, -1):repeatTensor(8, 1)
+ local output2 = module:forward(input2)
+ module:zeroGradParameters()
+ local gradInput2 = module:backward(input2, gradOutput2, 1/8)
+ mytester:assertTensorEq(gradInput2[2], gradInput, 0.000001, 'Euclidean updateGradInput 2D err')
+
+ mytester:assertTensorEq(gradWeight, module.gradWeight, 0.000001, 'Euclidean accGradParameters 2D err')
+
+ input:zero()
+ module.fastBackward = false
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+ mytester:assertlt(err,precision, 'error on weight ')
+
+ local ferr,berr = jac.testIO(module,input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+end
+
+function nntest.WeightedEuclidean()
+ local ini = math.random(5,7)
+ local inj = math.random(5,7)
+ local input = torch.randn(ini)
+ local gradOutput = torch.randn(inj)
+ local module = nn.WeightedEuclidean(ini,inj)
+
+ local output = module:forward(input):clone()
+
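+ -- reference forward: output[o] = sqrt(sum_i diagCov[i][o]^2 * (input[i] - weight[i][o])^2)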
+ local output2 = torch.Tensor(inj):zero()
+ local temp = input:clone()
+ for o = 1,module.weight:size(2) do
+ temp:copy(input):add(-1,module.weight:select(2,o))
+ temp:cmul(temp)
+ temp:cmul(module.diagCov:select(2,o)):cmul(module.diagCov:select(2,o))
+ output2[o] = math.sqrt(temp:sum())
+ end
+ mytester:assertTensorEq(output, output2, 0.000001, 'WeightedEuclidean forward 1D err')
+
+ local input2 = torch.randn(8, ini)
+ input2[2]:copy(input)
+ local output2 = module:forward(input2)
+ mytester:assertTensorEq(output2[2], output, 0.000001, 'WeightedEuclidean forward 2D err')
+
+ local output = module:forward(input):clone()
+ module:zeroGradParameters()
+ local gradInput = module:backward(input, gradOutput, 1):clone()
+ local gradInput2 = torch.zeros(ini)
+ for o = 1,module.weight:size(2) do
+ temp:copy(input)
+ temp:add(-1,module.weight:select(2,o))
+ temp:cmul(module.diagCov:select(2,o)):cmul(module.diagCov:select(2,o))
+ temp:mul(gradOutput[o]/output[o])
+ gradInput2:add(temp)
+ end
+ mytester:assertTensorEq(gradInput, gradInput2, 0.000001, 'WeightedEuclidean updateGradInput 1D err')
+
+ local gradWeight = module.gradWeight:clone():zero()
+ local gradDiagCov = module.gradDiagCov:clone():zero()
+ for o = 1,module.weight:size(2) do
+ if output[o] ~= 0 then
+ temp:copy(module.weight:select(2,o)):add(-1,input)
+ temp:cmul(module.diagCov:select(2,o)):cmul(module.diagCov:select(2,o))
+ temp:mul(gradOutput[o]/output[o])
+ gradWeight:select(2,o):add(temp)
+
+ temp:copy(module.weight:select(2,o)):add(-1,input)
+ temp:cmul(temp)
+ temp:cmul(module.diagCov:select(2,o))
+ temp:mul(gradOutput[o]/output[o])
+ gradDiagCov:select(2,o):add(temp)
+ end
+ end
+ mytester:assertTensorEq(gradWeight, module.gradWeight, 0.000001, 'WeightedEuclidean accGradParameters gradWeight 1D err')
+ mytester:assertTensorEq(gradDiagCov, module.gradDiagCov, 0.000001, 'WeightedEuclidean accGradParameters gradDiagCov 1D err')
+
+ local input2 = input:view(1, -1):repeatTensor(8, 1)
+ local gradOutput2 = gradOutput:view(1, -1):repeatTensor(8, 1)
+ local output2 = module:forward(input2)
+ module:zeroGradParameters()
+ local gradInput2 = module:backward(input2, gradOutput2, 1/8)
+ mytester:assertTensorEq(gradInput2[2], gradInput, 0.000001, 'WeightedEuclidean updateGradInput 2D err')
+
+ mytester:assertTensorEq(gradWeight, module.gradWeight, 0.000001, 'WeightedEuclidean accGradParameters gradWeight 2D err')
+ mytester:assertTensorEq(gradDiagCov, module.gradDiagCov, 0.000001, 'WeightedEuclidean accGradParameters gradDiagCov 2D err')
+
+ input:zero()
+ module.fastBackward = false
+
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+ mytester:assertlt(err,precision, 'error on weight ')
+
+ local err = jac.testJacobianParameters(module, input, module.diagCov, module.gradDiagCov)
+ mytester:assertlt(err,precision, 'error on bias ')
+
+ local ferr,berr = jac.testIO(module,input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+
+ input:zero()
+ module:zeroGradParameters()
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+ mytester:assertlt(err,precision, 'error on weight ')
+
+ local err = jac.testJacobianParameters(module, input, module.diagCov, module.gradDiagCov)
+ mytester:assertlt(err,precision, 'error on bias ')
+
+ local ferr,berr = jac.testIO(module,input2)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+end
+
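+-- Checks a criterion's analytic gradient (:backward) against a central-difference
+-- estimate of the forward loss: f'(x_i) ~ (f(x_i + h) - f(x_i - h)) / (2h).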
+local function criterionJacobianTest(cri, input, target)
+ local eps = 1e-6
+ local _ = cri:forward(input, target)
+ local dfdx = cri:backward(input, target)
+ -- for each input perturbation, do central difference
+ local centraldiff_dfdx = torch.Tensor():resizeAs(dfdx)
+ local input_s = input:storage()
+ local centraldiff_dfdx_s = centraldiff_dfdx:storage()
+ for i=1,input:nElement() do
+ -- f(xi + h)
+ input_s[i] = input_s[i] + eps
+ local fx1 = cri:forward(input, target)
+ -- f(xi - h)
+ input_s[i] = input_s[i] - 2*eps
+ local fx2 = cri:forward(input, target)
+ -- f'(xi) = (f(xi + h) - f(xi - h)) / 2h
+ local cdfx = (fx1 - fx2) / (2*eps)
+ -- store f' in appropriate place
+ centraldiff_dfdx_s[i] = cdfx
+ -- reset input[i]
+ input_s[i] = input_s[i] + eps
+ end
+
+ -- compare centraldiff_dfdx with :backward()
+ local err = (centraldiff_dfdx - dfdx):abs():max()
+ mytester:assertlt(err, precision, 'error in difference between central difference and :backward')
+end
+
+local function criterionJacobianTest1DTable(cri, input0, target)
+ -- assumes input is a tensor, which is split along the first dimension
+ local input = input0:split(1,1)
+ for i=1,#input do
+ input[i] = input[i][1]
+ end
+ local eps = 1e-6
+ local _ = cri:forward(input, target)
+ local dfdx = cri:backward(input, target)
+ -- for each input perturbation, do central difference
+ local centraldiff_dfdx = torch.Tensor():resizeAs(input0)
+ local input_s = input0:storage()
+ local centraldiff_dfdx_s = centraldiff_dfdx:storage()
+ for i=1,input0:nElement() do
+ -- f(xi + h)
+ input_s[i] = input_s[i] + eps
+ local fx1 = cri:forward(input, target)
+ -- f(xi - h)
+ input_s[i] = input_s[i] - 2*eps
+ local fx2 = cri:forward(input, target)
+ -- f'(xi) = (f(xi + h) - f(xi - h)) / 2h
+ local cdfx = (fx1 - fx2) / (2*eps)
+ -- store f' in appropriate place
+ centraldiff_dfdx_s[i] = cdfx
+ -- reset input[i]
+ input_s[i] = input_s[i] + eps
+ end
+ local centraldiff_dfdx_t = centraldiff_dfdx:split(1,1)
+ for i=1,#centraldiff_dfdx_t do
+ centraldiff_dfdx_t[i] = centraldiff_dfdx_t[i][1]
+ end
+ for i=1,#centraldiff_dfdx_t do
+ -- compare centraldiff_dfdx with :backward()
+ local err = (centraldiff_dfdx_t[i] - dfdx[i]):abs():max()
+ mytester:assertlt(err, precision, 'error in difference between central difference and :backward')
+ end
+end
+
+function nntest.SmoothL1Criterion()
+ local input = torch.rand(10)
+ local target = input:clone():add(torch.rand(10))
+ local cri = nn.SmoothL1Criterion()
+ criterionJacobianTest(cri, input, target)
+end
+
+function nntest.MSECriterion()
+ local input = torch.rand(10)
+ local target = input:clone():add(torch.rand(10))
+ local cri = nn.MSECriterion()
+ criterionJacobianTest(cri, input, target)
+end
+
+function nntest.SpatialAutoCropMSECriterion()
+ -- Tests the assumptions on input and target dimensions for the
+ -- nn.SpatialAutoCropMSECriterion criterion
+ local function testInputBounds()
+ for _, average in pairs({true, false}) do
+ local sMSE = nn.SpatialAutoCropMSECriterion(average)
+
+ local input = torch.Tensor(3, 3, 3)
+ local target = torch.Tensor(4, 3, 3)
+ mytester:assertError(function() sMSE:forward(input, target) end,
+ "Target and input must have same number of channels")
+
+ input = torch.Tensor(2, 4, 3, 3)
+ target = torch.Tensor(2, 3, 3, 3)
+ mytester:assertError(function() sMSE:forward(input, target) end,
+ "Target and input must have same number of channels")
+
+ input = torch.Tensor(2, 3, 3, 3)
+ target = torch.Tensor(1, 3, 3, 3)
+ mytester:assertError(function() sMSE:forward(input, target) end,
+ "Target and input must have same batch size")
+
+ input = torch.Tensor(2, 5, 5)
+ target = torch.Tensor(2, 5, 4)
+ mytester:assertError(function() sMSE:forward(input, target) end,
+ "input resolution must be smaller or equal to the spatial resolution of the target")
+
+ input = torch.Tensor(1, 2, 5, 5)
+ target = torch.Tensor(1, 2, 4, 5)
+ mytester:assertError(function() sMSE:forward(input, target) end,
+ "input resolution must be smaller or equal to the spatial resolution of the target")
+ end
+ end
+
+ -- Tests that the forward pass of nn.SpatialAutoCropMSECriterion
+ -- is equivalent to the forward pass of nn.MSECriterion with a pre-cropped target
+ local function testSpatialAutoCropMSECriterionBatched()
+ for _, average in pairs({true, false}) do
+ local sMSE = nn.SpatialAutoCropMSECriterion(average)
+ local MSE = nn.MSECriterion(average)
+
+ local batchSize = math.random(1,10)
+ local channels = math.random(1,10)
+ local inputHeight = math.random(1, 50)
+ local inputWidth = math.random(1, 50)
+ local targetHeight = inputHeight + math.random(0,5)
+ local targetWidth = inputWidth + math.random(0,5)
+
+ local input = torch.Tensor(batchSize, channels, inputHeight, inputWidth):uniform()
+ local target = torch.Tensor(batchSize, channels, targetHeight, targetWidth):uniform()
+
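+ -- crop indices: the target is cropped symmetrically (centered) down to the input's spatial size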
+ local heightStartIdx = 1 + math.floor((targetHeight - inputHeight)/2.0)
+ local heightEndIdx = heightStartIdx + inputHeight - 1
+ local widthStartIdx = 1 + math.floor((targetWidth - inputWidth)/2.0)
+ local widthEndIdx = widthStartIdx + inputWidth - 1
+
+ local croppedTarget = target[{{}, {}, {heightStartIdx, heightEndIdx}, {widthStartIdx, widthEndIdx}}]
+
+ local sMSEOut = sMSE:forward(input, target)
+ local MSEOut = MSE:forward(input, croppedTarget)
+ mytester:asserteq(sMSEOut, MSEOut)
+
+ local gradOutput = torch.Tensor():resizeAs(croppedTarget):uniform()
+ local sMSEGradInput = sMSE:backward(input, gradOutput)
+ local MSEGradInput = MSE:backward(input, gradOutput)
+ mytester:assertTensorEq(sMSEGradInput, MSEGradInput, 1e-7)
+ criterionJacobianTest(sMSE, input, gradOutput)
+ end
+ end
+
+ local function testSpatialAutoCropMSECriterionNonBatched()
+ for _, average in pairs({true, false}) do
+ local sMSE = nn.SpatialAutoCropMSECriterion(average)
+ local MSE = nn.MSECriterion(average)
+
+ local channels = math.random(1,10)
+ local inputHeight = math.random(1, 50)
+ local inputWidth = math.random(1, 50)
+ local targetHeight = inputHeight + math.random(0,5)
+ local targetWidth = inputWidth + math.random(0,5)
+
+ local input = torch.Tensor(channels, inputHeight, inputWidth):uniform()
+ local target = torch.Tensor(channels, targetHeight, targetWidth):uniform()
+
+ local heightStartIdx = 1 + math.floor((targetHeight - inputHeight)/2.0)
+ local heightEndIdx = heightStartIdx + inputHeight - 1
+ local widthStartIdx = 1 + math.floor((targetWidth - inputWidth)/2.0)
+ local widthEndIdx = widthStartIdx + inputWidth - 1
+
+ local croppedTarget = target[{{}, {heightStartIdx, heightEndIdx}, {widthStartIdx, widthEndIdx}}]
+
+ local sMSEOut = sMSE:forward(input, target)
+ local MSEOut = MSE:forward(input, croppedTarget)
+ mytester:asserteq(sMSEOut, MSEOut)
+
+ local gradOutput = torch.Tensor():resizeAs(croppedTarget):uniform()
+ local sMSEGradInput = sMSE:backward(input, gradOutput)
+ local MSEGradInput = MSE:backward(input, gradOutput)
+ mytester:assertTensorEq(sMSEGradInput, MSEGradInput, 1e-7)
+ criterionJacobianTest(sMSE, input, gradOutput)
+ end
+ end
+
+ testInputBounds()
+ testSpatialAutoCropMSECriterionBatched()
+ testSpatialAutoCropMSECriterionNonBatched()
+end
+
+function nntest.ClassSimplexCriterion()
+ local nClasses = torch.random(3,15)
+ local input = torch.rand(nClasses)
+ local target = torch.random(1,nClasses)
+ local cri = nn.ClassSimplexCriterion(nClasses)
+ criterionJacobianTest(cri, input, target)
+end
+
+
+function nntest.MarginCriterion()
+ local input = torch.rand(100)
+ local target = input:clone():add(torch.rand(100))
+ local cri = nn.MarginCriterion()
+ criterionJacobianTest(cri, input, target)
+end
+
+function nntest.SoftMarginCriterion()
+ local input = torch.rand(100)
+ local target = input:clone():add(torch.rand(100))
+ local cri = nn.SoftMarginCriterion()
+ criterionJacobianTest(cri, input, target)
+end
+
+function nntest.MultiMarginCriterion()
+ local input = torch.rand(100)
+ local target = math.random(1,100)
+ local cri = nn.MultiMarginCriterion(math.random(1,2), nil, 0.1)
+ criterionJacobianTest(cri, input, target)
+
+ local cri = nn.MultiMarginCriterion()
+ criterionJacobianTest(cri, input, target)
+
+ local cri = nn.MultiMarginCriterion(2)
+ criterionJacobianTest(cri, input, target)
+
+ local weights = torch.randn(100)
+ local cri = nn.MultiMarginCriterion(1, weights)
+ criterionJacobianTest(cri, input, target)
+end
+
+function nntest.MarginRankingCriterion()
+ local input = {torch.rand(1), torch.rand(1)}
+ local mrc = nn.MarginRankingCriterion()
+ local output = mrc:forward(input, 1)
+ local gradInput = mrc:backward(input, 1)
+ -- cast to float
+ local input2 = {input[1]:float(), input[2]:float()}
+ local mrc2 = mrc:clone():float()
+ local output2 = mrc2:forward(input2, 1)
+ local gradInput2 = mrc2:backward(input2, 1)
+ mytester:assert(math.abs(output2 - output) < 0.00001, "MRC:type() forward error")
+ mytester:assertTensorEq(gradInput[1]:float(), gradInput2[1], 0.00001, "MRC:type() backward error 1")
+ mytester:assert(torch.type(gradInput2[1]) == 'torch.FloatTensor', "MRC:type() error 1")
+ mytester:assertTensorEq(gradInput[2]:float(), gradInput2[2], 0.00001, "MRC:type() backward error 2")
+ mytester:assert(torch.type(gradInput2[2]) == 'torch.FloatTensor', "MRC:type() error 2")
+
+ -- batch, sizeAverage true, jacobian
+ local margin = math.random() * 2 - 1
+ local batch_size = math.random(1,10)
+ local crit = nn.MarginRankingCriterion(margin)
+ crit.sizeAverage = true
+ local v = torch.rand(2, batch_size)
+ local t = torch.Tensor(batch_size):random(0,1):mul(2):add(-1)
+ criterionJacobianTest1DTable(crit,v,t)
+
+ -- batch, sizeAverage false, jacobian
+ local margin = math.random() * 2 - 1
+ local crit = nn.MarginRankingCriterion(margin)
+ crit.sizeAverage = false
+ local v = torch.rand(2, batch_size)
+ local t = torch.Tensor(batch_size):random(0,1):mul(2):add(-1)
+ criterionJacobianTest1DTable(crit,v,t)
+end
+
+function nntest.ModuleCriterion()
+ local input = torch.randn(8,4)
+ local target = torch.randn(8,4)
+ local inputModule = nn.Tanh()
+ local criterion = nn.MSECriterion()
+ local mc = nn.ModuleCriterion(criterion, inputModule)
+
+ local err = mc:forward(input, target)
+ local gradInput = mc:backward(input, target)
+
+ local output = inputModule:forward(input)
+ local err2 = criterion:forward(output, target)
+ local gradOutput = criterion:backward(output, target)
+ local gradInput2 = inputModule:backward(input, gradOutput)
+
+ mytester:assert(err == err2, "ModuleCriterion backward err")
+ mytester:assertTensorEq(gradInput, gradInput2, 0.000001, "ModuleCriterion backward err")
+end
+
+function nntest.MaskedSelect()
+ local input = torch.randn(4, 5)
+ local mask = torch.ByteTensor(4, 5):bernoulli()
+ local module = nn.MaskedSelect()
+ local out = module:forward({input, mask})
+ local err = out:dist(input:maskedSelect(mask))
+ mytester:assertlt(err, 1e-15, torch.typename(module) .. ' - forward err ')
+
+ local gradOut = torch.Tensor({20, 80})
+ input = torch.Tensor({{10, 20}, {30, 40}})
+ local inTarget = torch.Tensor({{20, 0}, {0, 80}})
+ local mask = torch.ByteTensor({{1, 0}, {0, 1}})
+ local module = nn.MaskedSelect()
+ module:forward({input, mask})
+ local gradIn = module:backward({input, mask}, gradOut)
+ mytester:assertTensorEq(inTarget, gradIn[1], 1e-15, torch.typename(module) .. ' - backward err ')
+end
+
+function nntest.ParallelCriterion()
+ local input = {torch.rand(2,10), torch.randn(2,10)}
+ local target = {torch.IntTensor{1,8}, torch.randn(2,10)}
+ local nll = nn.ClassNLLCriterion()
+ local mse = nn.MSECriterion()
+ local pc = nn.ParallelCriterion():add(nll, 0.5):add(mse)
+ local output = pc:forward(input, target)
+ local output2 = nll:forward(input[1], target[1])/2 + mse:forward(input[2], target[2])
+ mytester:assert(math.abs(output2 - output) < 0.00001, "ParallelCriterion forward error")
+ local gradInput2 = {nll:backward(input[1], target[1]):clone():div(2), mse:backward(input[2], target[2])}
+ local gradInput = pc:backward(input, target)
+ mytester:assertTensorEq(gradInput[1], gradInput2[1], 0.000001, "ParallelCriterion backward error 1")
+ mytester:assertTensorEq(gradInput[2], gradInput2[2], 0.000001, "ParallelCriterion backward error 2")
+
+ -- test type
+ pc:float()
+ gradInput[1], gradInput[2] = gradInput[1]:clone(), gradInput[2]:clone()
+ local input3 = {input[1]:float(), input[2]:float()}
+ local target3 = {target[1]:float(), target[2]:float()}
+ local output3 = pc:forward(input3, target3)
+ local gradInput3 = pc:backward(input3, target3)
+ mytester:assert(math.abs(output3 - output) < 0.00001, "ParallelCriterion forward error type")
+ mytester:assertTensorEq(gradInput[1]:float(), gradInput3[1], 0.000001, "ParallelCriterion backward error 1 type")
+ mytester:assertTensorEq(gradInput[2]:float(), gradInput3[2], 0.000001, "ParallelCriterion backward error 2 type")
+
+ -- test repeatTarget
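+ -- repeatTarget = true broadcasts the single target to every sub-criterion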
+ local input = {torch.rand(2,10), torch.randn(2,10)}
+ local target = torch.randn(2,10)
+ local mse = nn.MSECriterion()
+ local pc = nn.ParallelCriterion(true):add(mse, 0.5):add(mse:clone())
+ local output = pc:forward(input, target)
+ local output2 = mse:forward(input[1], target)/2 + mse:forward(input[2], target)
+ mytester:assert(math.abs(output2 - output) < 0.00001, "ParallelCriterion repeatTarget forward error")
+ local gradInput = pc:backward(input, target)
+ local gradInput2 = {mse:backward(input[1], target):clone():div(2), mse:backward(input[2], target)}
+ mytester:assertTensorEq(gradInput[1], gradInput2[1], 0.000001, "ParallelCriterion repeatTarget backward error 1")
+ mytester:assertTensorEq(gradInput[2], gradInput2[2], 0.000001, "ParallelCriterion repeatTarget backward error 2")
+
+ -- table input
+ local input = {torch.randn(2,10), {torch.rand(2,10), torch.randn(2,10)}}
+ local target = {torch.IntTensor{2,5}, {torch.IntTensor{1,8}, torch.randn(2,10)}}
+ local nll2 = nn.ClassNLLCriterion()
+ local nll = nn.ClassNLLCriterion()
+ local mse = nn.MSECriterion()
+ local pc = nn.ParallelCriterion():add(nll, 0.5):add(mse)
+ local pc2 = nn.ParallelCriterion():add(nll2, 0.4):add(pc)
+ local output = pc2:forward(input, target)
+ local output2 = nll2:forward(input[1], target[1])*0.4 + nll:forward(input[2][1], target[2][1])/2 + mse:forward(input[2][2], target[2][2])
+ mytester:assert(math.abs(output2 - output) < 0.00001, "ParallelCriterion table forward error")
+ local gradInput2 = {
+ nll2:backward(input[1], target[1]):clone():mul(0.4),
+ {nll:backward(input[2][1], target[2][1]):clone():div(2), mse:backward(input[2][2], target[2][2])}
+ }
+ local gradInput = pc2:backward(input, target)
+ mytester:assertTensorEq(gradInput[1], gradInput2[1], 0.000001, "ParallelCriterion table backward error 1")
+ mytester:assertTensorEq(gradInput[2][1], gradInput2[2][1], 0.000001, "ParallelCriterion table backward error 2")
+ mytester:assertTensorEq(gradInput[2][2], gradInput2[2][2], 0.000001, "ParallelCriterion table backward error 3")
+end
+
+function nntest.MultiCriterion()
+ local input = torch.rand(2,10)
+ local target = torch.IntTensor{1,8}
+ local nll = nn.ClassNLLCriterion()
+ local nll2 = nn.CrossEntropyCriterion()
+ local mc = nn.MultiCriterion():add(nll, 0.5):add(nll2)
+ local output = mc:forward(input, target)
+ local output2 = nll:forward(input, target)/2 + nll2:forward(input, target)
+ mytester:assert(math.abs(output2 - output) < 0.00001, "MultiCriterion forward error")
+ local gradInput = mc:backward(input, target)
+ local gradInput2 = nll:backward(input, target):clone():div(2):add(nll2:backward(input, target))
+ mytester:assertTensorEq(gradInput, gradInput2, 0.000001, "MultiCriterion backward error ")
+
+ -- test type
+ mc:float()
+ gradInput = gradInput:clone()
+ local input3 = input:float()
+ local target3 = target:float()
+ local output3 = mc:forward(input3, target3)
+ local gradInput3 = mc:backward(input3, target3)
+ mytester:assert(math.abs(output3 - output) < 0.00001, "MultiCriterion forward error type")
+ mytester:assertTensorEq(gradInput:float(), gradInput3, 0.000001, "MultiCriterion backward error type")
+
+ -- test table input
+ mc:double()
+ local input = {torch.randn(2,10), {torch.randn(2,10), torch.randn(2,10)}}
+ local target = {torch.IntTensor{1,8}, {torch.IntTensor{5,6}, torch.IntTensor{4,3}}}
+ local pnllc = nn.ParallelCriterion():add(nll):add(nn.ParallelCriterion():add(nll:clone()):add(nll:clone()))
+ local pnllc2 = nn.ParallelCriterion():add(nll2):add(nn.ParallelCriterion():add(nll2:clone()):add(nll2:clone()))
+ local mc = nn.MultiCriterion():add(pnllc, 0.5):add(pnllc2)
+ local output = mc:forward(input, target)
+ local output2 = pnllc:forward(input, target)/2 + pnllc2:forward(input, target)
+ mytester:assert(math.abs(output2 - output) < 0.00001, "MultiCriterion forward table error")
+ local gradInput = mc:backward(input, target)
+ local gradInput2 = pnllc:clone():backward(input, target)
+ local gradInput2b = pnllc2:backward(input, target)
+ gradInput2[1]:div(2):add(gradInput2b[1])
+ gradInput2[2][1]:div(2):add(gradInput2b[2][1])
+ gradInput2[2][2]:div(2):add(gradInput2b[2][2])
+ mytester:assertTensorEq(gradInput[1], gradInput2[1], 0.000001, "MultiCriterion backward table 1 error ")
+ mytester:assertTensorEq(gradInput[2][1], gradInput2[2][1], 0.000001, "MultiCriterion backward table 2 error ")
+ mytester:assertTensorEq(gradInput[2][2], gradInput2[2][2], 0.000001, "MultiCriterion backward table 3 error ")
+end
+
+function nntest.WeightedMSECriterion()
+ local input = torch.rand(10)
+ local target = input:clone():add(torch.rand(10))
+ local cri = nn.WeightedMSECriterion(torch.rand(10))
+ criterionJacobianTest(cri, input, target)
+end
+
+function nntest.BCECriterion()
+ local eps = 1e-2
+ local input = torch.rand(10)*(1-eps) + eps/2
+ local target = torch.rand(10)*(1-eps) + eps/2
+ local cri = nn.BCECriterion()
+ criterionJacobianTest(cri, input, target)
+ --with weights
+ local weights= torch.rand(10)*(1-eps) + eps/2
+ local cri = nn.BCECriterion(weights)
+ criterionJacobianTest(cri, input, target)
+ -- with weights + batch
+ local bsz = 5
+ local input = torch.rand(bsz, 10)*(1-eps) + eps/2
+ local target = torch.rand(bsz, 10)*(1-eps) + eps/2
+ criterionJacobianTest(cri, input, target)
+end
+
+function nntest.DistKLDivCriterion()
+ local input = torch.rand(10)
+ local target = input:clone():add(torch.rand(10))
+ local cri = nn.DistKLDivCriterion(true) -- sizeAverage = true
+ criterionJacobianTest(cri, input, target)
+ cri = nn.DistKLDivCriterion(false) -- sizeAverage = false
+ criterionJacobianTest(cri, input, target)
+end
+
+function nntest.ClassNLLCriterion()
+ local batchsize = math.random(2,4)
+ local numLabels = math.random(5,10)
+
+ local function testclassnll(input, target)
+ -- default ClassNLLCriterion
+ local cri = nn.ClassNLLCriterion()
+ criterionJacobianTest(cri, input, target)
+
+ -- ClassNLLCriterion with weights
+ local weights = torch.rand(numLabels)
+ weights = weights / weights:sum()
+ cri = nn.ClassNLLCriterion(weights)
+ criterionJacobianTest(cri, input, target)
+ end
+
+ -- input/target: 1D/number
+ testclassnll(torch.rand(numLabels), math.random(1,numLabels))
+ -- input/target: 1D/1D
+ testclassnll(torch.rand(numLabels), torch.LongTensor(1):random(1, numLabels))
+ -- input/target: 2D/1D
+ testclassnll(torch.rand(batchsize, numLabels), torch.LongTensor(batchsize):random(1,numLabels))
+ -- test ignoreIndex
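+ -- targets equal to ignoreIndex must contribute zero loss and zero gradient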
+ local ignoreIndex = -1
+ local cri = nn.ClassNLLCriterion(nil, nil, ignoreIndex)
+ local input = torch.randn(numLabels)
+ local target = ignoreIndex
+ mytester:assert(cri:forward(input, target) == 0)
+ mytester:assert(cri:backward(input, target):abs():sum() == 0)
+ local input = torch.randn(batchsize, numLabels)
+ local target = torch.LongTensor(batchsize):random(1,numLabels)
+ target[1] = ignoreIndex
+ local output = cri:forward(input, target)
+ local gradInput = cri:backward(input, target):clone()
+ mytester:assert(gradInput[1]:abs():sum() == 0)
+ local input, target = input:sub(2,batchsize), target:sub(2,batchsize)
+ local output2 = cri:forward(input, target)
+ mytester:assert(math.abs(output2 - output) < 0.0000001)
+ local gradInput2 = cri:backward(input, target)
+ mytester:assertTensorEq(gradInput2, gradInput:sub(2,batchsize), 0.0000001)
+end
+
+function nntest.SpatialClassNLLCriterion()
+ local numLabels = math.random(5,10)
+ local h = math.random(5, 20)
+ local w = math.random(5, 20)
+ local batchSize = math.random(1, 4)
+ local input = torch.rand(batchSize, numLabels, h, w)
+ local target = torch.Tensor(batchSize, h, w)
+ target:apply(function() return math.random(1, numLabels) end)
+
+ -- default ClassNLLCriterion
+ local cri = nn.SpatialClassNLLCriterion()
+ criterionJacobianTest(cri, input, target)
+
+ -- ClassNLLCriterion with weights
+ local weights = torch.rand(numLabels)
+ cri = nn.SpatialClassNLLCriterion(weights)
+ criterionJacobianTest(cri, input, target)
+
+ -- check with ClassNLLCriterion
+ local spatial = nn.SpatialClassNLLCriterion(weights)
+ local regular = nn.ClassNLLCriterion(weights)
+ local spatial_out = spatial:forward(input, target)
+ local regular_out = regular:forward(input:permute(1, 3, 4, 2):contiguous():view(-1, numLabels),
+ target:view(-1))
+ mytester:eq(spatial_out, regular_out, 1e-6,
+ "spatial and regular criterions give different results")
+end
+
+function nntest.MultiLabelSoftMarginCriterion()
+ -- test w/o weights
+
+ local cri = nn.MultiLabelSoftMarginCriterion()
+
+ -- stochastic
+ local numLabels = math.random(5, 10)
+ local input = torch.randn(numLabels)
+ local target = torch.round(torch.rand(numLabels))
+ criterionJacobianTest(cri, input, target)
+
+ -- batch
+ local numLabels = math.random(5, 10)
+ local bsz = math.random(3, 7)
+ local input = torch.randn(bsz, numLabels)
+ local target = torch.round(torch.rand(bsz, numLabels))
+ criterionJacobianTest(cri, input, target)
+
+ -- test weights
+
+ local numLabels = math.random(5, 10)
+ local weights = torch.randn(numLabels)
+ local cri = nn.MultiLabelSoftMarginCriterion(weights)
+
+ -- stochastic
+ local input = torch.randn(numLabels)
+ local target = torch.round(torch.rand(numLabels))
+ criterionJacobianTest(cri, input, target)
+
+ -- batch
+ local bsz = math.random(3, 7)
+ local input = torch.randn(bsz, numLabels)
+ local target = torch.round(torch.rand(bsz, numLabels))
+ criterionJacobianTest(cri, input, target)
+end
+
+function nntest.CrossEntropyCriterion()
+ -- stochastic
+ local numLabels = math.random(5, 10)
+ local input = torch.zeros(numLabels)
+ local target = torch.random(1, numLabels)
+
+ local cri = nn.CrossEntropyCriterion()
+ criterionJacobianTest(cri, input, target)
+
+ -- batch
+ local numLabels = math.random(5,10)
+ local bsz = math.random(3, 7)
+ local input = torch.zeros(bsz, numLabels)
+ local target = torch.Tensor(bsz):random(1, numLabels)
+
+ local cri = nn.CrossEntropyCriterion()
+ criterionJacobianTest(cri, input, target)
+
+ -- with weights
+ local weights = torch.rand(numLabels)
+ weights = weights / weights:sum()
+ cri = nn.CrossEntropyCriterion(weights)
+ criterionJacobianTest(cri, input, target)
+
+ -- verify nll.sizeAverage preservation
+ cri = nn.CrossEntropyCriterion(weights)
+ cri.nll.sizeAverage = false
+ criterionJacobianTest(cri, input, target)
+ mytester:eq(cri.nll.sizeAverage, false,
+ "ClassNLLCriterion.sizeAverage overwritten")
+
+ -- verify nll.sizeAverage propagation
+ cri = nn.CrossEntropyCriterion(weights)
+ cri.sizeAverage = false
+ criterionJacobianTest(cri, input, target)
+ mytester:eq(cri.nll.sizeAverage, false,
+ "ClassNLLCriterion.sizeAverage not propagated")
+end
+
+function nntest.LogSigmoid()
+ local ini = math.random(3,5)
+ local inj = math.random(3,5)
+ local ink = math.random(3,5)
+ local input = torch.Tensor(ini,inj,ink):zero()
+ local module = nn.LogSigmoid()
+
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ local ferr,berr = jac.testIO(module,input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+end
+
+function nntest.LogSoftmax()
+ local ini = math.random(3,5)
+ local inj = math.random(3,5)
+ local input = torch.Tensor(ini,inj):zero()
+ local module = nn.LogSoftMax()
+
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err, 1e-3, 'error on state ')
+
+ local ferr,berr = jac.testIO(module,input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+
+ -- test logsoftmax when gradOutput is non-contiguous
+ local layer = nn.LogSoftMax()
+ layer:zeroGradParameters()
+ local input = torch.randn(4, 10)
+ local data = torch.randn(4, 20)
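+ -- narrowing dim 2 of a wider tensor yields a non-contiguous view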
+ local gradOutput = data:narrow(2, 1, 10):fill(0)
+ local output = layer:forward(input)
+ local gradInput1 = layer:backward(input, gradOutput):clone()
+ local output = layer:forward(input)
+ gradOutput = gradOutput:clone()
+ local gradInput2 = layer:backward(input, gradOutput):clone()
+
+ mytester:assertlt(gradInput1:add(-1, gradInput2):abs():max(),
+ 1e-10,
+ torch.typename(layer)
+ .. ' non-contiguous gradOutput check')
+
+end
+
+function nntest.SpatialLogSoftMax()
+ local ini = math.random(3,5)
+ local inj = math.random(3,5)
+ local ink = math.random(3,5)
+ local inl = math.random(3,5)
+ local input = torch.Tensor(inl, ink, inj, ini):zero()
+ local module = nn.SpatialLogSoftMax()
+
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,expprecision, 'error on state ')
+
+ local ferr,berr = jac.testIO(module,input)
+ mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+ mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+end
+
+-- function nntest.TemporalLogSoftmax()
+-- local ini = math.random(10,20)
+-- local inj = math.random(10,20)
+-- local input = torch.Tensor(ini,inj):zero()
+-- local module = nn.TemporalLogSoftMax()
+
+-- local err = jac.testJacobian(module,input)
+-- mytester:assertlt(err,precision, 'error on state ')
+
+-- local ferr,berr = jac.testIO(module,input)
+-- mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+-- mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+-- end
+
+function nntest.Max()
+ -- 1D
+ local ini = math.random(3,7)
+ local input = torch.Tensor(ini):zero()
+ local module = nn.Max(1)
+
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ -- negative dimension
+ local module = nn.Max(-1)
+ local input = torch.Tensor({1, 2, 3})
+ local expected = torch.Tensor({3})
+ local output = module:forward(input)
+ mytester:assertlt(torch.norm(output-expected), precision, 'error on forward ')
+ -- batch
+ local module = nn.Max(1, 1)
+ local input = torch.Tensor({{1, 2, 3},{4, 5, 6}})
+ local expected = torch.Tensor({3, 6})
+ local output = module:forward(input)
+ mytester:assertlt(torch.norm(output-expected), precision, 'error on forward ')
+
+ -- 3D
+ local ini = math.random(3,5)
+ local inj = math.random(3,5)
+ local ink = math.random(3,5)
+ local input = torch.Tensor(ini,inj*ink):zero()
+ local module = nn.Max(1)
+
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ local ferr,berr = jac.testIO(module,input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+end
+
+function nntest.Min()
+ -- 1D
+ local ini = math.random(3,7)
+ local input = torch.Tensor(ini):zero()
+ local module = nn.Min(1)
+
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ -- negative dimension
+ local module = nn.Min(-1)
+ local input = torch.Tensor({1, 2, 3})
+ local expected = torch.Tensor({1})
+ local output = module:forward(input)
+ mytester:assertlt(torch.norm(output-expected), precision, 'error on forward ')
+ -- batch
+ local module = nn.Min(1, 1)
+ local input = torch.Tensor({{1, 2, 3},{4, 5, 6}})
+ local expected = torch.Tensor({1, 4})
+ local output = module:forward(input)
+ mytester:assertlt(torch.norm(output-expected), precision, 'error on forward ')
+
+   -- 2D (a 3D volume flattened to ini x inj*ink)
+ local ini = math.random(3,5)
+ local inj = math.random(3,5)
+ local ink = math.random(3,5)
+ local input = torch.Tensor(ini,inj*ink):zero()
+ local module = nn.Min(1)
+
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ local ferr,berr = jac.testIO(module,input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+end
+
+function nntest.Mean()
+ -- 1D
+ local ini = math.random(3,7)
+ local input = torch.Tensor(ini):zero()
+ local module = nn.Mean(1)
+
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ -- negative dimension
+ local module = nn.Mean(-1)
+ local input = torch.Tensor({1, 2, 3})
+ local expected = torch.Tensor({2})
+ local output = module:forward(input)
+ mytester:assertlt(torch.norm(output-expected), precision, 'error on forward ')
+ -- batch
+ local module = nn.Mean(1, 1)
+ local input = torch.Tensor({{1, 2, 3},{4, 5, 6}})
+ local expected = torch.Tensor({2, 5})
+ local output = module:forward(input)
+ mytester:assertlt(torch.norm(output-expected), precision, 'error on forward ')
+
+ -- 3D
+ local ini = math.random(3,5)
+ local inj = math.random(3,5)
+ local ink = math.random(3,5)
+ local input = torch.Tensor(ini,inj,ink):zero()
+ local module = nn.Mean(torch.random(1,3))
+
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ local ferr,berr = jac.testIO(module,input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+end
+
+function nntest.Mul()
+ local ini = math.random(3,5)
+ local inj = math.random(3,5)
+ local ink = math.random(3,5)
+ local input = torch.Tensor(ini,inj,ink):zero()
+ local module = nn.Mul()
+
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,precision, 'error on state ')
+ local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+ mytester:assertlt(err,precision, 'error on weight ')
+ local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+ mytester:assertlt(err,precision, 'error on weight [direct update] ')
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on weight [%s]', t))
+ end
+
+ local ferr,berr = jac.testIO(module,input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+end
+
+function nntest.Sigmoid()
+ local ini = math.random(3,5)
+ local inj = math.random(3,5)
+ local ink = math.random(3,5)
+ local input = torch.Tensor(ini,inj,ink):zero()
+ local module = nn.Sigmoid()
+
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ local ferr,berr = jac.testIO(module,input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+end
+
+function nntest.Softmax()
+ local ini = math.random(3,5)
+ local ink = math.random(3,5)
+ local input = torch.Tensor(ink, ini):zero()
+ local module = nn.SoftMax()
+
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,expprecision, 'error on state ')
+
+ local ferr,berr = jac.testIO(module,input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+end
+
+function nntest.SpatialSoftMax()
+ local ini = math.random(3,5)
+ local inj = math.random(3,5)
+ local ink = math.random(3,5)
+ local inl = math.random(3,5)
+ local input = torch.Tensor(inl, ink, inj, ini):zero()
+ local module = nn.SpatialSoftMax()
+
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,expprecision, 'error on state ')
+
+ local ferr,berr = jac.testIO(module,input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+end
+
+function nntest.Softmin()
+ local ini = math.random(3,5)
+ local ink = math.random(3,5)
+ local input = torch.Tensor(ink, ini):zero()
+ local module = nn.SoftMin()
+
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,expprecision, 'error on state ')
+
+ local ferr,berr = jac.testIO(module,input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+end
+
+function nntest.Softsign()
+ local ini = math.random(3,5)
+ local ink = math.random(3,5)
+ local input = torch.Tensor(ink, ini):zero()
+ local module = nn.SoftSign()
+
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ local ferr,berr = jac.testIO(module,input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+end
+
+function nntest.SoftPlus()
+ local ini = math.random(3,5)
+ local inj = math.random(3,5)
+ local ink = math.random(3,5)
+ local input = torch.Tensor(ini,inj,ink):zero()
+ local module = nn.SoftPlus()
+
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ local ferr,berr = jac.testIO(module,input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+end
+
+function nntest.SpatialSubtractiveNormalization_2dkernel()
+ local inputSize = math.random(6,9)
+ local kersize = 3
+ local nbfeatures = math.random(3,5)
+ local kernel = torch.Tensor(kersize,kersize):fill(1)
+ local module = nn.SpatialSubtractiveNormalization(nbfeatures,kernel)
+ local input = torch.rand(nbfeatures,inputSize,inputSize/2)
+
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ local ferr,berr = jac.testIO(module,input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+
+ -- test batch mode
+ local output = module:forward(input):clone()
+ local gradOutput = output:clone():uniform(0,1)
+ local gradInput = module:backward(input, gradOutput):clone()
+ local batchSize = 4
+ local input2 = torch.rand(batchSize,nbfeatures,inputSize,inputSize/2)
+ input2[2]:copy(input)
+
+ local output2 = module:forward(input2)
+ local gradOutput2 = output2:clone():uniform(0,1)
+ gradOutput2[2]:copy(gradOutput)
+ local gradInput2 = module:backward(input2, gradOutput2)
+
+   mytester:assertTensorEq(output2[2], output, 0.000001, "SpatialSubtractiveNormalization 2d forward batch err")
+   mytester:assertTensorEq(gradInput2[2], gradInput, 0.000001, "SpatialSubtractiveNormalization 2d backward batch err")
+
+ local err = jac.testJacobian(module,input2)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ local ferr,berr = jac.testIO(module,input2)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+
+end
+
+function nntest.SpatialSubtractiveNormalization_1dkernel()
+ local inputSize = math.random(6,9)
+ local kersize = 3
+ local nbfeatures = math.random(3,5)
+ local kernel = torch.Tensor(kersize):fill(1)
+ local module = nn.SpatialSubtractiveNormalization(nbfeatures,kernel)
+ local input = torch.rand(nbfeatures,inputSize,inputSize/2)
+
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ local ferr,berr = jac.testIO(module,input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+
+ -- test batch mode
+ local output = module:forward(input):clone()
+ local gradOutput = output:clone():uniform(0,1)
+ local gradInput = module:backward(input, gradOutput):clone()
+ local batchSize = 4
+ local input2 = torch.rand(batchSize,nbfeatures,inputSize,inputSize/2)
+ input2[2]:copy(input)
+
+ local output2 = module:forward(input2)
+ local gradOutput2 = output2:clone():uniform(0,1)
+ gradOutput2[2]:copy(gradOutput)
+ local gradInput2 = module:backward(input2, gradOutput2)
+
+   mytester:assertTensorEq(output2[2], output, 0.000001, "SpatialSubtractiveNormalization 1d forward batch err")
+   mytester:assertTensorEq(gradInput2[2], gradInput, 0.000001, "SpatialSubtractiveNormalization 1d backward batch err")
+
+ local err = jac.testJacobian(module,input2)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ local ferr,berr = jac.testIO(module,input2)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+end
+
+function nntest.SpatialDivisiveNormalization_2dkernel()
+ local inputSize = math.random(6,9)
+ local kersize = 3
+ local nbfeatures = math.random(3,5)
+ local kernel = torch.Tensor(kersize,kersize):fill(1)
+ local module = nn.SpatialDivisiveNormalization(nbfeatures,kernel)
+ local input = torch.rand(nbfeatures,inputSize,inputSize/2)
+
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ local ferr,berr = jac.testIO(module,input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+
+ -- test batch mode
+ local output = module:forward(input):clone()
+ local gradOutput = output:clone():uniform(0,1)
+ local gradInput = module:backward(input, gradOutput):clone()
+ local batchSize = 4
+ local input2 = torch.rand(batchSize,nbfeatures,inputSize,inputSize/2)
+ input2[2]:copy(input)
+
+ local output2 = module:forward(input2)
+ local gradOutput2 = output2:clone():uniform(0,1)
+ gradOutput2[2]:copy(gradOutput)
+ local gradInput2 = module:backward(input2, gradOutput2)
+
+ mytester:assertTensorEq(output2[2], output, 0.000001, "SpatialDivisiveNormalization 2d forward batch err")
+   mytester:assertTensorEq(gradInput2[2], gradInput, 0.000001, "SpatialDivisiveNormalization 2d backward batch err")
+
+ local err = jac.testJacobian(module,input2)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ local ferr,berr = jac.testIO(module,input2)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+end
+
+function nntest.SpatialDivisiveNormalization_1dkernel()
+ local inputSize = math.random(6,9)
+ local kersize = 3
+ local nbfeatures = math.random(3,5)
+ local kernel = torch.Tensor(kersize):fill(1)
+ local module = nn.SpatialDivisiveNormalization(nbfeatures,kernel)
+ local input = torch.rand(nbfeatures,inputSize,inputSize/2)
+
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ local ferr,berr = jac.testIO(module,input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+
+ -- test batch mode
+ local output = module:forward(input):clone()
+ local gradOutput = output:clone():uniform(0,1)
+ local gradInput = module:backward(input, gradOutput):clone()
+ local batchSize = 4
+ local input2 = torch.rand(batchSize,nbfeatures,inputSize,inputSize/2)
+ input2[2]:copy(input)
+
+ local output2 = module:forward(input2)
+ local gradOutput2 = output2:clone():uniform(0,1)
+ gradOutput2[2]:copy(gradOutput)
+ local gradInput2 = module:backward(input2, gradOutput2)
+
+ mytester:assertTensorEq(output2[2], output, 0.000001, "SpatialDivisiveNormalization 1d forward batch err")
+   mytester:assertTensorEq(gradInput2[2], gradInput, 0.000001, "SpatialDivisiveNormalization 1d backward batch err")
+
+ local err = jac.testJacobian(module,input2)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ local ferr,berr = jac.testIO(module,input2)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+end
+
+function nntest.SpatialContrastiveNormalization()
+ local inputSize = math.random(6,9)
+ local kersize = 3
+ local nbfeatures = math.random(3,5)
+ local kernel = torch.Tensor(kersize,kersize):fill(1)
+ local module = nn.SpatialContrastiveNormalization(nbfeatures,kernel)
+ local input = torch.rand(nbfeatures,inputSize,inputSize/2)
+
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ local ferr,berr = jac.testIO(module,input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+
+ -- test batch mode and type
+ local output = module:forward(input):clone()
+ local gradOutput = output:clone():uniform(0,1)
+ local gradInput = module:backward(input, gradOutput):clone()
+ local batchSize = 4
+ local input2 = torch.rand(batchSize,nbfeatures,inputSize,inputSize/2):float()
+ input2[2]:copy(input)
+
+ module:float() -- type-cast
+ local output2 = module:forward(input2)
+ local gradOutput2 = output2:clone():uniform(0,1)
+ gradOutput2[2]:copy(gradOutput)
+ local gradInput2 = module:backward(input2, gradOutput2)
+
+ mytester:assertTensorEq(output2[2], output:float(), 0.000002, "SpatialContrastiveNormalization 2d forward batch err")
+   mytester:assertTensorEq(gradInput2[2], gradInput:float(), 0.000002, "SpatialContrastiveNormalization 2d backward batch err")
+
+ module:double()
+ input2 = input2:double()
+ local err = jac.testJacobian(module,input2)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ local ferr,berr = jac.testIO(module,input2)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+end
+
+function nntest.SpatialCrossMapLRN()
+ local inputSize = math.random(6,9)
+ local size = math.random(1,3)*2+1
+ local nbfeatures = math.random(3,8)
+ local alpha = math.random(1,100)/100
+ local beta = math.random(0,100)/100
+ local k = math.random(1,3)
+ local module = nn.SpatialCrossMapLRN(size, alpha, beta, k)
+ local input = torch.rand(nbfeatures,inputSize,inputSize)
+
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ local ferr,berr = jac.testIO(module,input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+
+ -- test batch mode and type
+ local output = module:forward(input):clone()
+ local gradOutput = output:clone():uniform(0,1)
+ local gradInput = module:backward(input, gradOutput):clone()
+ local batchSize = 4
+ local input2 = torch.rand(batchSize,nbfeatures,inputSize,inputSize):float()
+ input2[2]:copy(input)
+
+ module:float() -- type-cast
+ local output2 = module:forward(input2)
+ local gradOutput2 = output2:clone():uniform(0,1)
+ gradOutput2[2]:copy(gradOutput)
+ local gradInput2 = module:backward(input2, gradOutput2)
+
+ mytester:assertTensorEq(output2[2], output:float(), 0.000001, "SpatialCrossMapLRN 2d forward batch err")
+   mytester:assertTensorEq(gradInput2[2], gradInput:float(), 0.000001, "SpatialCrossMapLRN 2d backward batch err")
+
+ module:double()
+ input2 = input2:double()
+ local err = jac.testJacobian(module,input2)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ local ferr,berr = jac.testIO(module,input2)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+end
+
+function nntest.SpatialConvolution()
+ local from = math.random(1,5)
+ local to = math.random(1,5)
+ local ki = math.random(1,5)
+ local kj = math.random(1,5)
+ local si = math.random(1,4)
+ local sj = math.random(1,4)
+ local outi = math.random(5,7)
+ local outj = math.random(5,7)
+ local ini = (outi-1)*si+ki
+ local inj = (outj-1)*sj+kj
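+   -- shape arithmetic: a valid (unpadded) convolution satisfies
+   --   out = (in - k)/stride + 1   <=>   in = (out-1)*stride + k
+   -- so choosing the output size first guarantees an integer input size,
+   -- e.g. out=5, k=3, s=2 gives in = 4*2+3 = 11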
+ local module = nn.SpatialConvolution(from, to, ki, kj, si, sj)
+ local input = torch.Tensor(from, inj, ini):zero()
+
+ local function jacTests(module)
+ -- stochastic
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error on state ')
+
+ local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+ mytester:assertlt(err , precision, 'error on weight ')
+
+ if module.bias then
+ local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+ mytester:assertlt(err , precision, 'error on bias ')
+ end
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+ mytester:assertlt(err , precision, 'error on weight [direct update] ')
+
+ if module.bias then
+ local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+ mytester:assertlt(err , precision, 'error on bias [direct update] ')
+ end
+
+ nn.hessian.enable()
+
+ local err = jac.testDiagHessianInput(module, input)
+ mytester:assertlt(err , precision, 'error on diagHessianInput')
+
+ local err = jac.testDiagHessianWeight(module, input)
+ mytester:assertlt(err , precision, 'error on diagHessianWeight')
+
+ if module.bias then
+ local err = jac.testDiagHessianBias(module, input)
+         mytester:assertlt(err , precision, 'error on diagHessianBias')
+ end
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on weight [%s]', t))
+ end
+
+ if module.bias then
+ for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on bias [%s]', t))
+ end
+ end
+
+ -- batch
+
+ --verbose = true
+ local batch = math.random(2,5)
+ outi = math.random(4,8)
+ outj = math.random(4,8)
+ ini = (outi-1)*si+ki
+ inj = (outj-1)*sj+kj
+ module = nn.SpatialConvolution(from, to, ki, kj, si, sj)
+ input = torch.Tensor(batch,from,inj,ini):zero()
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'batch error on state ')
+
+ local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+ mytester:assertlt(err , precision, 'batch error on weight ')
+
+ if module.bias then
+ local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+ mytester:assertlt(err , precision, 'batch error on bias ')
+ end
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+ mytester:assertlt(err , precision, 'batch error on weight [direct update] ')
+
+ if module.bias then
+ local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+ mytester:assertlt(err , precision, 'batch error on bias [direct update] ')
+ end
+
+ local err = jac.testDiagHessianInput(module, input)
+ mytester:assertlt(err , precision, 'error on diagHessianInput')
+
+ local err = jac.testDiagHessianWeight(module, input)
+ mytester:assertlt(err , precision, 'error on diagHessianWeight')
+
+ if module.bias then
+ local err = jac.testDiagHessianBias(module, input)
+         mytester:assertlt(err , precision, 'error on diagHessianBias')
+ end
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on weight [%s]', t))
+ end
+
+ if module.bias then
+ for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+ mytester:assertlt(err, precision, string.format(
+ 'batch error on bias [%s]', t))
+ end
+ end
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
+ end
+
+ jacTests(module)
+ module:noBias()
+ jacTests(module)
+ module.bias = torch.Tensor(module.nOutputPlane):zero()
+ module.gradBias = torch.Tensor(module.nOutputPlane):zero()
+ module:reset()
+ jacTests(module)
+
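+   -- simulate the aliased-storage setting produced by getParameters(): the
+   -- weight becomes a narrow view into a larger storage deliberately filled
+   -- with NaNs, so any read outside the view (or a stale storage pointer)
+   -- poisons the results and fails the exact comparisons below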
+ local output = module:forward(input):clone()
+ local gradOutput = output:clone():normal()
+   local gradInput = module:backward(input, gradOutput):clone()
+ local bigWeight = module.weight.new(module.weight:nElement() * 4):fill(0/0) -- fill with nans
+ local newWeight = bigWeight:narrow(1, module.weight:nElement() * 3, module.weight:nElement())
+ newWeight = newWeight:viewAs(module.weight):copy(module.weight)
+ module.weight = newWeight
+ local newOutput = module:forward(input)
+   local newGradInput = module:backward(input, gradOutput)
+ mytester:asserteq((newOutput - output):abs():max(), 0,
+ torch.typename(module) .. ' forward failure case in a getParameters setting ')
+ mytester:asserteq((newGradInput - gradInput):abs():max(), 0,
+ torch.typename(module) .. ' backward failure case in a getParameters setting ')
+
+end
+
+function nntest.SpatialConvolutionMM()
+ local from = math.random(2,5)
+ local to = math.random(1,5)
+ local ki = math.random(1,5)
+ local kj = math.random(1,5)
+ local di = math.random(1,4)
+ local dj = math.random(1,4)
+ local padW = math.random(0,2)
+ local padH = math.random(0,2)
+ local outi = math.random(5,9)
+ local outj = math.random(5,9)
+ local ini = (outi-1)*di+ki-padW*2
+ local inj = (outj-1)*dj+kj-padH*2
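+   -- padded shape arithmetic: in = (out-1)*stride + k - 2*pad, e.g. out=5,
+   -- k=3, stride=1, pad=1 gives in = 4 + 3 - 2 = 5 (a "same" convolution)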
+ local module = nn.SpatialConvolutionMM(from, to, ki, kj, di, dj, padW, padH)
+ local input = torch.Tensor(from, inj, ini):zero()
+
+ -- stochastic
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error on state ')
+
+ local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+ mytester:assertlt(err , precision, 'error on weight ')
+
+ local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+ mytester:assertlt(err , precision, 'error on bias ')
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+ mytester:assertlt(err , precision, 'error on weight [direct update] ')
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+ mytester:assertlt(err , precision, 'error on bias [direct update] ')
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on weight [%s]', t))
+ end
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on bias [%s]', t))
+ end
+
+ -- batch
+
+ --verbose = true
+ local batch = math.random(2,5)
+
+ module = nn.SpatialConvolutionMM(from, to, ki, kj, di, dj, padW, padH)
+ input = torch.Tensor(batch,from,inj,ini):zero()
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'batch error on state ')
+
+ local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+ mytester:assertlt(err , precision, 'batch error on weight ')
+
+ local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+ mytester:assertlt(err , precision, 'batch error on bias ')
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+ mytester:assertlt(err , precision, 'batch error on weight [direct update] ')
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+ mytester:assertlt(err , precision, 'batch error on bias [direct update] ')
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on weight [%s]', t))
+ end
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+ mytester:assertlt(err, precision, string.format(
+ 'batch error on bias [%s]', t))
+ end
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
+
+ -- non-contiguous
+ local input = torch.randn(batch,from,ini,inj):transpose(3,4) -- non-contiguous
+ local inputc = input:contiguous() -- contiguous
+ local output = module:forward(input):clone()
+ local outputc = module:forward(inputc):clone()
+   mytester:asserteq(0, (output-outputc):abs():max(), torch.typename(module) .. ' - non-contiguous forward err ')
+ local gradInput = module:backward(input, output):clone()
+ local gradInputc = module:backward(inputc, outputc):clone()
+   mytester:asserteq(0, (gradInput-gradInputc):abs():max(), torch.typename(module) .. ' - non-contiguous backward err ')
+end
+
+function nntest.SpatialConvolutionLocal()
+ local from = math.random(1,4)
+ local to = math.random(1,4)
+ local ki = math.random(1,3)
+ local kj = math.random(1,3)
+ local si = math.random(1,3)
+ local sj = math.random(1,3)
+ local outi = math.random(5,6)
+ local outj = math.random(5,6)
+ local ini = (outi-1)*si+ki
+ local inj = (outj-1)*sj+kj
+ local module = nn.SpatialConvolutionLocal(from, to, ini, inj, ki, kj, si, sj)
+ local input = torch.Tensor(from, inj, ini):zero()
+
+ -- stochastic
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error on state ')
+
+ local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+ mytester:assertlt(err , precision, 'error on weight ')
+
+ local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+ mytester:assertlt(err , precision, 'error on bias ')
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+ mytester:assertlt(err , precision, 'error on weight [direct update] ')
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+ mytester:assertlt(err , precision, 'error on bias [direct update] ')
+
+ nn.hessian.enable()
+
+ local err = jac.testDiagHessianInput(module, input)
+ mytester:assertlt(err , precision, 'error on diagHessianInput')
+
+ local err = jac.testDiagHessianWeight(module, input)
+ mytester:assertlt(err , precision, 'error on diagHessianWeight')
+
+ local err = jac.testDiagHessianBias(module, input)
+   mytester:assertlt(err , precision, 'error on diagHessianBias')
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on weight [%s]', t))
+ end
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on bias [%s]', t))
+ end
+
+ -- batch
+
+ --verbose = true
+ local batch = math.random(2,5)
+ outi = math.random(4,6)
+ outj = math.random(4,6)
+ ini = (outi-1)*si+ki
+ inj = (outj-1)*sj+kj
+ module = nn.SpatialConvolutionLocal(from, to, ini, inj, ki, kj, si, sj)
+ input = torch.Tensor(batch, from, inj, ini):zero()
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'batch error on state ')
+
+ local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+ mytester:assertlt(err , precision, 'batch error on weight ')
+
+ local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+ mytester:assertlt(err , precision, 'batch error on bias ')
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+ mytester:assertlt(err , precision, 'batch error on weight [direct update] ')
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+ mytester:assertlt(err , precision, 'batch error on bias [direct update] ')
+
+ local err = jac.testDiagHessianInput(module, input)
+ mytester:assertlt(err , precision, 'error on diagHessianInput')
+
+ local err = jac.testDiagHessianWeight(module, input)
+ mytester:assertlt(err , precision, 'error on diagHessianWeight')
+
+ local err = jac.testDiagHessianBias(module, input)
+   mytester:assertlt(err , precision, 'error on diagHessianBias')
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on weight [%s]', t))
+ end
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+ mytester:assertlt(err, precision, string.format(
+ 'batch error on bias [%s]', t))
+ end
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
+
+ -- check against nn.SpatialConvolution
+ local conv = nn.SpatialConvolution(from, to, ki, kj, si, sj)
+ torch.repeatTensor(module.bias, conv.bias:view(to, 1, 1), 1, outj, outi)
+ torch.repeatTensor(module.weight, conv.weight:view(1, 1, from, to, ki, kj), outi, outj, 1, 1, 1, 1)
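+   -- tiling one shared kernel and bias across every output location makes the
+   -- untied (locally connected) layer compute exactly what the weight-sharing
+   -- nn.SpatialConvolution computes, so the two outputs must agree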
+ local input = torch.rand(batch, from, inj, ini)
+ local output = module:forward(input)
+ local outputConv = conv:forward(input)
+ local err = torch.dist(output, outputConv)
+ mytester:assertlt(err, precision, 'error checking against nn.SpatialConvolution')
+end
+
+function nntest.SpatialFullConvolution()
+ local from = math.random(2,5)
+ local to = math.random(1,5)
+ local ki = math.random(1,5)
+ local kj = math.random(1,5)
+ local di = math.random(1,4)
+ local dj = math.random(1,4)
+ local padW = math.random(0,2)
+ local padH = math.random(0,2)
+ local outi = math.random(5,9)
+ local outj = math.random(5,9)
+ local adjW = (outi + padW*2 - ki) % di
+ local adjH = (outj + padH*2 - kj) % dj
+ local ini = math.floor((outi + padW*2 - ki)/di + 1)
+ local inj = math.floor((outj + padH*2 - kj)/dj + 1)
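+   -- a full (transposed) convolution inverts the forward shape formula:
+   --   out = (in-1)*stride - 2*pad + k + adj
+   -- adjW/adjH absorb the remainder lost to floor() above, so the module is
+   -- guaranteed to hit the requested outi/outj exactly (asserted in the batch
+   -- test below)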
+ local module = nn.SpatialFullConvolution(from, to, ki, kj, di, dj, padW, padH, adjW, adjH)
+ local input = torch.Tensor(from, inj, ini):zero()
+
+ local function jacTests(module)
+ -- stochastic
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error on state ')
+
+ local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+ mytester:assertlt(err , precision, 'error on weight ')
+
+ if module.bias then
+ local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+ mytester:assertlt(err , precision, 'error on bias ')
+ end
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+ mytester:assertlt(err , precision, 'error on weight [direct update] ')
+
+ if module.bias then
+ local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+ mytester:assertlt(err , precision, 'error on bias [direct update] ')
+ end
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on weight [%s]', t))
+ end
+
+ if module.bias then
+ for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on bias [%s]', t))
+ end
+ end
+
+ -- batch
+
+ --verbose = true
+ local batch = math.random(2,5)
+
+ module = nn.SpatialFullConvolution(from, to, ki, kj, di, dj, padW, padH, adjW, adjH)
+ input = torch.Tensor(batch,from,inj,ini):zero()
+
+ -- Check that the required output size matches the actual output size
+ local output = module:forward(input)
+ mytester:asserteq(output:size(3), outj, 'output height error')
+ mytester:asserteq(output:size(4), outi, 'output width error')
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'batch error on state ')
+
+ local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+ mytester:assertlt(err , precision, 'batch error on weight ')
+
+ if module.bias then
+ local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+ mytester:assertlt(err , precision, 'batch error on bias ')
+ end
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+ mytester:assertlt(err , precision, 'batch error on weight [direct update] ')
+
+ if module.bias then
+ local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+ mytester:assertlt(err , precision, 'batch error on bias [direct update] ')
+ end
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on weight [%s]', t))
+ end
+
+ if module.bias then
+ for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+ mytester:assertlt(err, precision, string.format(
+ 'batch error on bias [%s]', t))
+ end
+ end
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
+ end
+
+ jacTests(module)
+ module:noBias()
+ jacTests(module)
+ module.bias = torch.Tensor(module.nOutputPlane):zero()
+ module.gradBias = torch.Tensor(module.nOutputPlane):zero()
+ module:reset()
+ jacTests(module)
+
+ -- non-contiguous
+ local batch = math.random(2,5)
+ local input = torch.randn(batch,from,ini,inj):transpose(3,4) -- non-contiguous
+ local inputc = input:contiguous() -- contiguous
+ local output = module:forward(input)
+ local outputc = module:forward(inputc)
+   mytester:asserteq(0, (output-outputc):abs():max(), torch.typename(module) .. ' - non-contiguous forward err ')
+ local gradInput = module:backward(input, output)
+ local gradInputc = module:backward(inputc, outputc)
+   mytester:asserteq(0, (gradInput-gradInputc):abs():max(), torch.typename(module) .. ' - non-contiguous backward err ')
+end
+
+function nntest.SpatialFullConvolutionDualInput()
+ local from = math.random(2,5)
+ local to = math.random(1,5)
+ local ki = math.random(1,5)
+ local kj = math.random(1,5)
+ local di = math.random(1,4)
+ local dj = math.random(1,4)
+ local padW = math.random(0,2)
+ local padH = math.random(0,2)
+ local outi = math.random(5,9)
+ local outj = math.random(5,9)
+ local ini = math.floor((outi + padW*2 - ki)/di + 1)
+ local inj = math.floor((outj + padH*2 - kj)/dj + 1)
+ local adjW = (outi + 2 * padW - ki) % di
+ local adjH = (outj + 2 * padH - kj) % dj
+ local targetTensor = torch.Tensor(outj, outi):zero()
+ local input = torch.Tensor(from, inj, ini):zero()
+
+ local module = nn.SpatialFullConvolution(from, to, ki, kj, di, dj, padW, padH)
+ local moduleRef = nn.SpatialFullConvolution(from, to, ki, kj, di, dj, padW, padH, adjW, adjH)
+ moduleRef.weight:copy(module.weight)
+ moduleRef.bias:copy(module.bias)
+
+ -- Check that the required output size matches the actual output size
+ -- when using the dual input mode
+ local output = module:forward({input, targetTensor})
+ mytester:asserteq(output:size(2), outj, 'output height error')
+ mytester:asserteq(output:size(3), outi, 'output width error')
+
+ -- Check that backward and forward match the reference module
+ local outputRef = moduleRef:forward(input)
+ mytester:asserteq(0, (output-outputRef):abs():max(), torch.typename(module) .. ' - output err ')
+ local gradOutput = outputRef:clone():uniform()
+ local gradInputRef = moduleRef:backward(input, gradOutput)
+ local gradInput = module:backward({input, targetTensor}, gradOutput)
+ mytester:asserteq(0, (gradInput[1]-gradInputRef):abs():max(), torch.typename(module) .. ' - gradInput[1] err ')
+
+ -- Check that gradInput[2] is the singleton tensor {0}
+ mytester:asserteq(gradInput[2]:storage():size(), 1, torch.typename(module) .. ' - gradInput[2] size err ')
+ mytester:asserteq(gradInput[2]:storage()[1], 0, torch.typename(module) .. ' - gradInput[2] value err ')
+end
+
+function nntest.SpatialDilatedConvolution()
+ local from = math.random(1,5)
+ local to = math.random(1,5)
+ local ki = math.random(1,5)
+ local kj = math.random(1,5)
+ local di = math.random(1,4)
+ local dj = math.random(1,4)
+ local padW = math.random(0,2)
+ local padH = math.random(0,2)
+ local outi = math.random(5,9)
+ local outj = math.random(5,9)
+ local dilationW = math.random(1,10)
+ local dilationH = math.random(1,10)
+ local ini = (outi - 1) * di - 2 * padW + dilationW * (ki-1) + 1
+ local inj = (outj - 1) * dj - 2 * padH + dilationH * (kj-1) + 1
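+   -- dilation inflates the effective kernel to dilation*(k-1)+1, hence
+   --   in = (out-1)*stride - 2*pad + dilation*(k-1) + 1;
+   -- dilation=1 recovers the plain convolution formula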
+
+ local module = nn.SpatialDilatedConvolution(from, to, ki, kj, di, dj, padW, padH, dilationW, dilationH)
+ local input = torch.Tensor(from, inj, ini):zero()
+
+ -- stochastic
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error on state ')
+
+ local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+ mytester:assertlt(err , precision, 'error on weight ')
+
+ local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+ mytester:assertlt(err , precision, 'error on bias ')
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+ mytester:assertlt(err , precision, 'error on weight [direct update] ')
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+ mytester:assertlt(err , precision, 'error on bias [direct update] ')
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on weight [%s]', t))
+ end
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on bias [%s]', t))
+ end
+
+ -- batch
+
+ --verbose = true
+ local batch = math.random(2,5)
+
+ module = nn.SpatialDilatedConvolution(from, to, ki, kj, di, dj, padW, padH, dilationW, dilationH)
+ input = torch.Tensor(batch,from,inj,ini):zero()
+
+ -- Check that the required output size matches the actual output size
+ local output = module:forward(input)
+ mytester:asserteq(output:size(3), outj, 'output height error')
+ mytester:asserteq(output:size(4), outi, 'output width error')
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'batch error on state ')
+
+ local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+ mytester:assertlt(err , precision, 'batch error on weight ')
+
+ local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+ mytester:assertlt(err , precision, 'batch error on bias ')
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+ mytester:assertlt(err , precision, 'batch error on weight [direct update] ')
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+ mytester:assertlt(err , precision, 'batch error on bias [direct update] ')
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on weight [%s]', t))
+ end
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+ mytester:assertlt(err, precision, string.format(
+ 'batch error on bias [%s]', t))
+ end
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
+
+ -- non-contiguous
+ local input = torch.randn(batch,from,ini,inj):transpose(3,4) -- non-contiguous
+ local inputc = input:contiguous() -- contiguous
+ local output = module:forward(input)
+ local outputc = module:forward(inputc)
+   mytester:asserteq(0, (output-outputc):abs():max(), torch.typename(module) .. ' - non-contiguous forward err ')
+ local gradInput = module:backward(input, output)
+ local gradInputc = module:backward(inputc, outputc)
+   mytester:asserteq(0, (gradInput-gradInputc):abs():max(), torch.typename(module) .. ' - non-contiguous backward err ')
+end
+
+function nntest.SpatialConvolutionMap()
+ local from = math.random(1,5)
+ local fanin = math.random(1, from)
+ local to = math.random(1,5)
+ local ki = math.random(1,5)
+ local kj = math.random(1,5)
+ local si = math.random(1,3)
+ local sj = math.random(1,3)
+ local outi = math.random(5,9)
+ local outj = math.random(5,9)
+ local ini = (outi-1)*si+ki
+ local inj = (outj-1)*sj+kj
+
+ local module = nn.SpatialConvolutionMap(nn.tables.random(from, to, fanin), ki, kj, si, sj)
+ local input = torch.Tensor(from, inj, ini):zero()
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error on state ')
+
+ local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+ mytester:assertlt(err , precision, 'error on weight ')
+
+ local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+ mytester:assertlt(err , precision, 'error on bias ')
+
+ nn.hessian.enable()
+
+ local err = jac.testDiagHessianInput(module, input)
+ mytester:assertlt(err , precision, 'error on diagHessianInput')
+
+ local err = jac.testDiagHessianWeight(module, input)
+ mytester:assertlt(err , precision, 'error on diagHessianWeight')
+
+ local err = jac.testDiagHessianBias(module, input)
+   mytester:assertlt(err , precision, 'error on diagHessianBias')
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on weight [%s]', t))
+ end
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on bias [%s]', t))
+ end
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
+
+ -- batch
+
+ --verbose = true
+ local batch = math.random(2,6)
+ module = nn.SpatialConvolutionMap(nn.tables.random(from, to, fanin), ki, kj, si, sj)
+ input = torch.Tensor(batch,from,inj,ini):zero()
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'batch error on state ')
+
+ local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+ mytester:assertlt(err , precision, 'batch error on weight ')
+
+ local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+ mytester:assertlt(err , precision, 'batch error on bias ')
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+ mytester:assertlt(err , precision, 'batch error on weight [direct update] ')
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+ mytester:assertlt(err , precision, 'batch error on bias [direct update] ')
+
+ local err = jac.testDiagHessianInput(module, input)
+ mytester:assertlt(err , precision, 'error on diagHessianInput')
+
+ local err = jac.testDiagHessianWeight(module, input)
+ mytester:assertlt(err , precision, 'error on diagHessianWeight')
+
+ local err = jac.testDiagHessianBias(module, input)
+   mytester:assertlt(err , precision, 'error on diagHessianBias')
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on weight [%s]', t))
+ end
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+ mytester:assertlt(err, precision, string.format(
+ 'batch error on bias [%s]', t))
+ end
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
+end
+
+function nntest.SpatialFullConvolutionMap()
+ local from = math.random(2,4)
+ local to = math.random(2,5)
+ local fanin = math.random(1, from)
+ local tt = nn.tables.random(from, to, fanin)
+ local ki = math.random(2,5)
+ local kj = math.random(2,5)
+ local si = math.random(1,3)
+ local sj = math.random(1,3)
+ local ini = math.random(5,7)
+ local inj = math.random(5,7)
+ local module = nn.SpatialFullConvolutionMap(tt, ki, kj, si, sj)
+ local input = torch.Tensor(from, inj, ini):zero()
+
+ -- stochastic
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error on state ')
+
+ local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+ mytester:assertlt(err , precision, 'error on weight ')
+
+ local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+ mytester:assertlt(err , precision, 'error on bias ')
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+ mytester:assertlt(err , precision, 'error on weight [direct update] ')
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+ mytester:assertlt(err , precision, 'error on bias [direct update] ')
+
+ nn.hessian.enable()
+
+ local err = jac.testDiagHessianInput(module, input)
+ mytester:assertlt(err , precision, 'error on diagHessianInput')
+
+ local err = jac.testDiagHessianWeight(module, input)
+ mytester:assertlt(err , precision, 'error on diagHessianWeight')
+
+ local err = jac.testDiagHessianBias(module, input)
+   mytester:assertlt(err , precision, 'error on diagHessianBias')
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on weight [%s]', t))
+ end
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on bias [%s]', t))
+ end
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
+end
+
+function nntest.SpatialFullConvolutionCompare()
+ local from = math.random(2,4)
+ local to = math.random(2,5)
+ local tt = nn.tables.full(from, to)
+ local ki = math.random(2,5)
+ local kj = math.random(2,5)
+ local si = math.random(1,3)
+ local sj = math.random(1,3)
+ local ini = math.random(7,8)
+ local inj = math.random(7,8)
+ local module1 = nn.SpatialFullConvolutionMap(tt, ki, kj, si, sj)
+ local module2 = nn.SpatialFullConvolution(from, to, ki, kj, si, sj)
+ local input = torch.rand(from, inj, ini)
+   module1.bias:copy(module2.bias)
+   for k=1,tt:size(1) do
+      module1.weight[k]:copy(module2.weight[tt[k][1]][tt[k][2]])
+   end
+
+ local o1 = module1:updateOutput(input)
+ local o2 = module2:updateOutput(input)
+ mytester:assertlt(o1:dist(o2), precision, 'error on output')
+
+ local go1 = torch.rand(o1:size())
+ local go2 = go1:clone()
+
+ local gi1= module1:updateGradInput(input,go1)
+ local gi2 = module2:updateGradInput(input,go2)
+ mytester:assertlt(gi1:dist(gi2), precision, 'error on gradInput')
+
+ module1:zeroGradParameters()
+ module2:zeroGradParameters()
+
+ module1:accGradParameters(input,go1)
+ module2:accGradParameters(input,go2)
+ for k=1,tt:size(1) do
+ mytester:assertlt(module1.gradWeight[k]:dist(module2.gradWeight[tt[k][1]][tt[k][2]]),precision,'error on gradWeight ' .. k)
+ end
+ mytester:assertlt(module1.gradBias:dist(module2.gradBias),precision,'error on gradBias ')
+end
+
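+-- generic single-sample vs. batch consistency check: wrap one sample into a
+-- batch of size 1, push it through a clone of the module, and require the
+-- batched forward/backward/accGradParameters results to match the per-sample
+-- ones on every module field named in plist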
+local function batchcompare(smod, sin, plist)
+ local bs = torch.LongStorage(sin:dim()+1)
+ bs[1] = 1
+ for i=1,sin:dim() do bs[i+1] = sin:size()[i] end
+ local bin = torch.Tensor(bs):copy(sin)
+ local bmod = smod:clone()
+
+ local sout = smod:forward(sin):clone()
+ local bout = bmod:forward(bin):clone()
+
+ local sgout = torch.randn(sout:size())
+ local bgout = torch.Tensor(bout:size())
+ bgout:copy(sgout)
+
+ local sgin = smod:backward(sin, sgout)
+ local bgin = bmod:backward(bin, bgout)
+
+ smod:accGradParameters(sin, sgout, 1)
+ bmod:accGradParameters(bin, bgout, 1)
+
+ mytester:assertTensorEq(sout,bout:select(1,1), 1e-8, 'batchcompare error on output')
+ mytester:assertTensorEq(sgin,bgin:select(1,1), 1e-8, 'batchcompare error on gradInput')
+
+ for i,v in pairs(plist) do
+ mytester:assertTensorEq(smod[v],bmod[v], 1e-8, 'batchcompare error on ' .. v)
+ end
+end
+
+function nntest.SpatialConvolutionBatchCompare()
+ local from = math.random(1,5)
+ local to = math.random(1,5)
+ local ki = math.random(1,5)
+ local kj = math.random(1,5)
+ local si = math.random(1,4)
+ local sj = math.random(1,4)
+ local outi = math.random(5,9)
+ local outj = math.random(5,9)
+ local ini = (outi-1)*si+ki
+ local inj = (outj-1)*sj+kj
+
+ local module = nn.SpatialConvolution(from, to, ki, kj, si, sj)
+ module:zeroGradParameters()
+ local input = torch.randn(from,inj,ini)
+
+ batchcompare(module,input, {'weight','bias','gradWeight','gradBias'})
+end
+
+function nntest.SpatialFullConvolutionBatchCompare()
+ local from = math.random(1,5)
+ local to = math.random(1,5)
+ local ki = math.random(1,5)
+ local kj = math.random(1,5)
+ local si = math.random(1,4)
+ local sj = math.random(1,4)
+ local ini = math.random(5,9)
+ local inj = math.random(5,9)
+
+ local module = nn.SpatialFullConvolution(from, to, ki, kj, si, sj)
+ module:zeroGradParameters()
+ local input = torch.randn(from, inj, ini)
+
+ batchcompare(module,input, {'weight','bias','gradWeight','gradBias'})
+end
+
+function nntest.SpatialSubSamplingBatchCompare()
+ local from = math.random(1,6)
+ local ki = math.random(1,5)
+ local kj = math.random(1,5)
+ local si = math.random(1,4)
+ local sj = math.random(1,4)
+ local outi = math.random(6,10)
+ local outj = math.random(6,10)
+ local ini = (outi-1)*si+ki
+ local inj = (outj-1)*sj+kj
+ local module = nn.SpatialSubSampling(from, ki, kj, si, sj)
+ module:zeroGradParameters()
+   local input = torch.randn(from,inj,ini)
+
+ batchcompare(module,input, {'weight','bias','gradWeight','gradBias'})
+end
+
+function nntest.SpatialSubSampling()
+ local from = math.random(1,6)
+ local ki = math.random(1,5)
+ local kj = math.random(1,5)
+ local si = math.random(1,4)
+ local sj = math.random(1,4)
+ local outi = math.random(6,10)
+ local outj = math.random(6,10)
+ local ini = (outi-1)*si+ki
+ local inj = (outj-1)*sj+kj
+ local module = nn.SpatialSubSampling(from, ki, kj, si, sj)
+ local input = torch.Tensor(from, inj, ini):zero()
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error on state ')
+
+ local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+ mytester:assertlt(err , precision, 'error on weight ')
+
+ local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+ mytester:assertlt(err , precision, 'error on bias ')
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+ mytester:assertlt(err , precision, 'error on weight [direct update] ')
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+ mytester:assertlt(err , precision, 'error on bias [direct update] ')
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on weight [%s]', t))
+ end
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on bias [%s]', t))
+ end
+
+ local batch = math.random(2,5)
+ outi = math.random(4,8)
+ outj = math.random(4,8)
+ ini = (outi-1)*si+ki
+ inj = (outj-1)*sj+kj
+ module = nn.SpatialSubSampling(from, ki, kj, si, sj)
+ input = torch.Tensor(batch,from,inj,ini):zero()
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'batch error on state ')
+
+ local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+ mytester:assertlt(err , precision, 'batch error on weight ')
+
+ local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+ mytester:assertlt(err , precision, 'batch error on bias ')
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+ mytester:assertlt(err , precision, 'batch error on weight [direct update] ')
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+ mytester:assertlt(err , precision, 'batch error on bias [direct update] ')
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+ mytester:assertlt(err, precision, string.format(
+ 'batch error on weight [%s]', t))
+ end
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+ mytester:assertlt(err, precision, string.format(
+ 'batch error on bias [%s]', t))
+ end
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
+end
+
+function nntest.SpatialMaxPooling()
+ for _,ceil_mode in pairs({true,false}) do
+ local from = math.random(1,5)
+ local ki = math.random(1,4)
+ local kj = math.random(1,4)
+ local si = math.random(1,3)
+ local sj = math.random(1,3)
+ local outi = math.random(4,5)
+ local outj = math.random(4,5)
+ local padW = math.min(math.random(0,1),math.floor(ki/2))
+ local padH = math.min(math.random(0,1),math.floor(kj/2))
+ local ini = (outi-1)*si+ki-2*padW
+ local inj = (outj-1)*sj+kj-2*padH
+
+ local ceil_string = ceil_mode and 'ceil' or 'floor'
+ local module = nn.SpatialMaxPooling(ki,kj,si,sj,padW,padH)
+ if ceil_mode then module:ceil() else module:floor() end
+ local input = torch.rand(from,inj,ini)
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error '..ceil_string..' mode on state ')
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+
+ -- batch
+ local nbatch = math.random(2,5)
+ input = torch.rand(nbatch,from,inj,ini)
+ module = nn.SpatialMaxPooling(ki,kj,si,sj,padW,padH)
+ if ceil_mode then module:ceil() else module:floor() end
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error '..ceil_string..' mode on state (Batch)')
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err (Batch) ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err (Batch) ', precision)
+ end
+end
+
+function nntest.SpatialMaxUnpooling()
+ for _,ceil_mode in pairs({true,false}) do
+ local from = math.random(1,5)
+ local ki = math.random(2,4)
+ local kj = math.random(2,4)
+ local si, sj = ki, kj
+ local outi = math.random(4,5)
+ local outj = math.random(4,5)
+ local padW = math.min(math.random(0,1),math.floor(ki/2))
+ local padH = math.min(math.random(0,1),math.floor(kj/2))
+ local ini = (outi-1)*si+ki-2*padW
+ local inj = (outj-1)*sj+kj-2*padH
+
+ local ceil_string = ceil_mode and 'ceil' or 'floor'
+ local poolingModule = nn.SpatialMaxPooling(ki,kj,si,sj,padW,padH)
+ if ceil_mode then poolingModule:ceil() else poolingModule:floor() end
+ local module = nn.SpatialMaxUnpooling(poolingModule)
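+      -- the unpooling module is paired with poolingModule and reuses the
+      -- argmax indices saved by its most recent forward(), so the pooling
+      -- forward below must run first, on the same data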
+
+ local original = torch.rand(from,inj,ini)
+ local input = poolingModule:forward(original)
+ local output = module:forward(input)
+
+ mytester:assert(output:isSameSizeAs(original),'SpatialMaxUnpooling output size err')
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error '..ceil_string..' mode on state ')
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+
+ -- batch
+ local nbatch = math.random(2,5)
+ original = torch.rand(nbatch,from,inj,ini)
+ input = poolingModule:forward(original)
+ output = module:forward(input)
+
+ mytester:assert(output:isSameSizeAs(original),'SpatialMaxUnpooling batch output size err')
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error '..ceil_string..' mode on state (Batch)')
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err (Batch) ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err (Batch) ', precision)
+ end
+end
+
+function nntest.SpatialDilatedMaxPooling()
+ for _,ceil_mode in pairs({true,false}) do
+ local from = math.random(1,5)
+ local ki = math.random(1,4)
+ local kj = math.random(1,4)
+ local si = math.random(1,3)
+ local sj = math.random(1,3)
+ local outi = math.random(4,5)
+ local outj = math.random(4,5)
+ local padW = math.min(math.random(0,1),math.floor(ki/2))
+ local padH = math.min(math.random(0,1),math.floor(kj/2))
+ local dilationW = math.random(1,5)
+ local dilationH = math.random(1,5)
+ local ini = (outi-1)*si+(dilationW*(ki-1)+1)-2*padW
+ local inj = (outj-1)*sj+(dilationH*(kj-1)+1)-2*padH
+
+ local ceil_string = ceil_mode and 'ceil' or 'floor'
+ local module = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padW,padH,dilationW, dilationH)
+ if ceil_mode then module:ceil() else module:floor() end
+ local input = torch.rand(from,inj,ini)
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error '..ceil_string..' mode on state ')
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+ mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+
+ -- batch
+ local nbatch = math.random(2,5)
+ input = torch.rand(nbatch,from,inj,ini)
+ module = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padW,padH,dilationW,dilationH)
+ if ceil_mode then module:ceil() else module:floor() end
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error '..ceil_string..' mode on state (Batch)')
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err (Batch) ')
+ mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err (Batch) ')
+ end
+end
+
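+-- Illustrative note: with dilation the kernel's effective extent grows to
+-- dilation*(k-1) + 1, which is why the input sizes above are computed as
+-- in = (out-1)*stride + (dilation*(k-1)+1) - 2*pad:
+do
+   local k, d = 3, 2
+   assert(d*(k - 1) + 1 == 5) -- a 3-tap kernel with dilation 2 spans 5 cells
+end
+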
+function nntest.SpatialFractionalMaxPooling()
+ local batch = math.random(1, 3)
+ local plane = math.random(1, 3)
+ local outW = math.random(1, 7)
+ local outH = math.random(1, 7)
+ local poolSizeW = math.random(2, 4)
+ local poolSizeH = math.random(2, 4)
+
+ local minInW = outW + poolSizeW
+ local minInH = outH + poolSizeH
+
+ local inW = math.random(minInW, minInW + 6)
+ local inH = math.random(minInH, minInH + 6)
+
+ -- fix the pooling regions so they aren't regenerated with every
+ -- forward(), so testJacobian can work properly
+ local module =
+ nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, outW, outH)
+ :fixPoolingRegions()
+ local input = nil
+ if batch == 1 then
+ input = torch.Tensor(plane, inH, inW):zero()
+ else
+ input = torch.Tensor(batch, plane, inH, inW):zero()
+ end
+
+ local err = nn.Jacobian.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error on state')
+end
+
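+-- Why fixPoolingRegions() is needed (illustrative sketch): testJacobian calls
+-- forward() repeatedly with perturbed inputs and assumes the module computes a
+-- fixed function. Fractional max pooling normally resamples its pooling
+-- regions on every forward(), so the regions must be pinned first:
+do
+   local m = nn.SpatialFractionalMaxPooling(2, 2, 3, 3):fixPoolingRegions()
+   local x = torch.rand(1, 8, 8)
+   local y1 = m:forward(x):clone()
+   local y2 = m:forward(x):clone()
+   assert((y1 - y2):abs():max() == 0) -- deterministic once regions are fixed
+end
+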
+function nntest.SpatialFractionalMaxPooling_Ratio()
+ -- Fix a reduction ratio, and test with two different input sizes
+ local reductionRatioW = torch.uniform(0.4, 0.74)
+ local reductionRatioH = torch.uniform(0.4, 0.74)
+
+ for tries = 1, 2 do
+ local batch = math.random(1, 3)
+ local plane = math.random(1, 3)
+ local poolSizeW = math.random(2, 3)
+ local poolSizeH = math.random(2, 3)
+
+ local minInW = math.random(5, 8) + poolSizeW
+ local minInH = math.random(5, 8) + poolSizeH
+
+ local inW = math.random(minInW, minInW + 6)
+ local inH = math.random(minInH, minInH + 6)
+
+ -- fix the pooling regions so they aren't regenerated with every
+ -- forward(), so testJacobian can work properly
+ local module =
+ nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH,
+ reductionRatioW, reductionRatioH)
+ :fixPoolingRegions()
+ local input = nil
+ if batch == 1 then
+ input = torch.Tensor(plane, inH, inW):zero()
+ else
+ input = torch.Tensor(batch, plane, inH, inW):zero()
+ end
+
+ -- Make sure that the output size is based on our ratio
+ local output = module:updateOutput(input)
+ if batch == 1 then
+ mytester:asserteq(output:size(3), math.floor(reductionRatioW * inW))
+ mytester:asserteq(output:size(2), math.floor(reductionRatioH * inH))
+ else
+ mytester:asserteq(output:size(4), math.floor(reductionRatioW * inW))
+ mytester:asserteq(output:size(3), math.floor(reductionRatioH * inH))
+ end
+
+ local err = nn.Jacobian.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error on state')
+ end
+end
+
+function nntest.SpatialAveragePooling()
+ for _,count_include_pad in pairs({true,false}) do
+ for _,ceil_mode in pairs({true,false}) do
+ local from = math.random(1,5)
+ local ki = math.random(1,4)
+ local kj = math.random(1,4)
+ local si = math.random(1,3)
+ local sj = math.random(1,3)
+ local outi = math.random(4,5)
+ local outj = math.random(4,5)
+ local padW = math.min(math.random(0,1),math.floor(ki/2))
+ local padH = math.min(math.random(0,1),math.floor(kj/2))
+ local ini = (outi-1)*si+ki-2*padW
+ local inj = (outj-1)*sj+kj-2*padH
+
+ local mode_string = ceil_mode and 'ceil' or 'floor'
+
+ local module = nn.SpatialAveragePooling(ki, kj, si, sj, padW, padH)
+ if ceil_mode then module:ceil() else module:floor() end
+ if count_include_pad then
+ module:setCountIncludePad()
+ mode_string = mode_string .. ' - count include padding'
+ else
+ module:setCountExcludePad()
+ mode_string = mode_string .. ' - count exclude padding'
+ end
+ local input = torch.Tensor(from, inj, ini):uniform()
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error'..mode_string..' on state ')
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+
+ -- batch
+ local batch = math.random(2,5)
+ outi = math.random(4,5)
+ outj = math.random(4,5)
+ local padW = math.min(math.random(0,1),math.floor(ki/2))
+ local padH = math.min(math.random(0,1),math.floor(kj/2))
+ local ini = (outi-1)*si+ki-2*padW
+ local inj = (outj-1)*sj+kj-2*padH
+
+ module = nn.SpatialAveragePooling(ki, kj, si, sj, padW, padH)
+ if ceil_mode then module:ceil() else module:floor() end
+ if count_include_pad then
+ module:setCountIncludePad()
+ else
+ module:setCountExcludePad()
+ end
+ input = torch.Tensor(batch,from,inj,ini):uniform()
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'batch error'..mode_string..' on state ')
+
+      local ferr, berr = jac.testIO(module, input)
+      mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err (Batch) ', precision)
+      mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err (Batch) ', precision)
+ end
+ end
+ -- test against SpatialSubSampling
+ local from = math.random(1,6)
+ local ki = math.random(1,5)
+ local kj = math.random(1,5)
+ local si = math.random(1,4)
+ local sj = math.random(1,4)
+ local outi = math.random(6,10)
+ local outj = math.random(6,10)
+ local padW = 0
+ local padH = 0
+ local ini = (outi-1)*si+ki-2*padW
+ local inj = (outj-1)*sj+kj-2*padH
+
+ local module = nn.SpatialAveragePooling(ki, kj, si, sj, padW, padH)
+ local sap = nn.SpatialSubSampling(from, ki, kj, si, sj)
+ sap.weight:fill(1.0/(ki*kj))
+ sap.bias:fill(0.0)
+
+ local input = torch.Tensor(from, inj, ini):uniform()
+
+ local output = module:forward(input)
+ local gradInput = module:backward(input, output)
+ local output2 = sap:forward(input)
+ local gradInput2 = sap:updateGradInput(input, output)
+
+ mytester:assertTensorEq(output, output2, 0.000001, torch.typename(module) .. ' forward err ')
+ mytester:assertTensorEq(gradInput, gradInput2, 0.000001, torch.typename(module) .. ' backward err ')
+
+ -- test against SpatialSubSampling, batch mode
+ local batch = math.random(2,5)
+ outi = math.random(4,8)
+ outj = math.random(4,8)
+ local padW = 0
+ local padH = 0
+ local ini = (outi-1)*si+ki-2*padW
+ local inj = (outj-1)*sj+kj-2*padH
+
+ module = nn.SpatialAveragePooling(ki, kj, si, sj, padW, padH)
+ input = torch.Tensor(batch,from,inj,ini):uniform()
+
+ local sap = nn.SpatialSubSampling(from, ki, kj, si, sj)
+ sap.weight:fill(1.0/(ki*kj))
+ sap.bias:fill(0.0)
+
+ local output = module:forward(input)
+ local gradInput = module:backward(input, output)
+ local output2 = sap:forward(input)
+ local gradInput2 = sap:updateGradInput(input, output)
+
+ mytester:assertTensorEq(output, output2, 0.000001, torch.typename(module) .. ' forward err (Batch) ')
+ mytester:assertTensorEq(gradInput, gradInput2, 0.000001, torch.typename(module) .. ' backward err (Batch) ')
+
+end
+
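+-- Arithmetic note (illustrative): setCountIncludePad()/setCountExcludePad()
+-- only change the divisor of windows overlapping the zero padding. For a 2x2
+-- window on a corner with pad 1 covering a single real pixel v, include-pad
+-- averages v/4 (padded zeros count toward the divisor), exclude-pad gives v/1:
+do
+   local m = nn.SpatialAveragePooling(2, 2, 2, 2, 1, 1)
+   local x = torch.Tensor(1, 2, 2):fill(1)
+   m:setCountIncludePad()
+   assert(math.abs(m:forward(x)[1][1][1] - 0.25) < 1e-6)
+   m:setCountExcludePad()
+   assert(math.abs(m:forward(x)[1][1][1] - 1.0) < 1e-6)
+end
+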
+function nntest.SpatialAdaptiveMaxPooling()
+ local from = math.random(1,5)
+ local ki = math.random(1,5)
+ local kj = math.random(1,5)
+ local ini = math.random(1,16)
+ local inj = math.random(1,16)
+
+ local module = nn.SpatialAdaptiveMaxPooling(ki,kj)
+ local input = torch.rand(from,ini,inj)
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error on state ')
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+
+ -- batch
+ local nbatch = math.random(1,3)
+ input = torch.rand(nbatch,from,ini,inj)
+ module = nn.SpatialAdaptiveMaxPooling(ki,kj)
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error on state (Batch) ')
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err (Batch) ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err (Batch) ', precision)
+
+ -- non-contiguous
+
+ input = torch.rand(from,ini,inj):transpose(2,3)
+ module = nn.SpatialAdaptiveMaxPooling(ki,kj)
+ local inputc = input:contiguous() -- contiguous
+ local output = module:forward(input):clone()
+ local outputc = module:forward(inputc):clone()
+ mytester:asserteq(0, (output-outputc):abs():max(), torch.typename(module) .. ' - non-contiguous err ')
+ local gradInput = module:backward(input, output):clone()
+ local gradInputc = module:backward(inputc, outputc):clone()
+ mytester:asserteq(0, (gradInput-gradInputc):abs():max(), torch.typename(module) .. ' - non-contiguous err ')
+
+ -- non-contiguous batch
+ local nbatch = math.random(1,3)
+ input = torch.rand(nbatch,from,ini,inj):transpose(1,3):transpose(2,4)
+ local inputc = input:contiguous() -- contiguous
+ module = nn.SpatialAdaptiveMaxPooling(ki,kj)
+
+ local output = module:forward(input):clone()
+ local outputc = module:forward(inputc):clone()
+ mytester:asserteq(0, (output-outputc):abs():max(), torch.typename(module) .. ' - batch non-contiguous err ')
+ local gradInput = module:backward(input, output):clone()
+ local gradInputc = module:backward(inputc, outputc):clone()
+ mytester:asserteq(0, (gradInput-gradInputc):abs():max(), torch.typename(module) .. ' - batch non-contiguous err ')
+
+end
+
+function nntest.SpatialAdaptiveAveragePooling()
+ local from = math.random(1,5)
+ local ki = math.random(1,5)
+ local kj = math.random(1,5)
+ local ini = math.random(1,16)
+ local inj = math.random(1,16)
+
+ local module = nn.SpatialAdaptiveAveragePooling(ki,kj)
+ local input = torch.rand(from,ini,inj)
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error on state ')
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+
+ -- batch
+ local nbatch = math.random(1,3)
+ input = torch.rand(nbatch,from,ini,inj)
+ module = nn.SpatialAdaptiveAveragePooling(ki,kj)
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error on state (Batch) ')
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err (Batch) ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err (Batch) ', precision)
+
+ -- non-contiguous
+
+ input = torch.rand(from,ini,inj):transpose(2,3)
+ module = nn.SpatialAdaptiveAveragePooling(ki,kj)
+ local inputc = input:contiguous() -- contiguous
+ local output = module:forward(input):clone()
+ local outputc = module:forward(inputc):clone()
+ mytester:asserteq(0, (output-outputc):abs():max(), torch.typename(module) .. ' - non-contiguous err ')
+ local gradInput = module:backward(input, output):clone()
+ local gradInputc = module:backward(inputc, outputc):clone()
+ mytester:asserteq(0, (gradInput-gradInputc):abs():max(), torch.typename(module) .. ' - non-contiguous err ')
+
+ -- non-contiguous batch
+ local nbatch = math.random(1,3)
+ input = torch.rand(nbatch,from,ini,inj):transpose(1,3):transpose(2,4)
+ local inputc = input:contiguous() -- contiguous
+ module = nn.SpatialAdaptiveAveragePooling(ki,kj)
+
+ local output = module:forward(input):clone()
+ local outputc = module:forward(inputc):clone()
+ mytester:asserteq(0, (output-outputc):abs():max(), torch.typename(module) .. ' - batch non-contiguous err ')
+ local gradInput = module:backward(input, output):clone()
+ local gradInputc = module:backward(inputc, outputc):clone()
+ mytester:asserteq(0, (gradInput-gradInputc):abs():max(), torch.typename(module) .. ' - batch non-contiguous err ')
+
+end
+
+function nntest.SpatialLPPooling()
+ local fanin = math.random(1,4)
+ local osizex = math.random(1,4)
+ local osizey = math.random(1,4)
+ local p = 2
+ local mx = math.random(2,6)
+ local my = math.random(2,6)
+ local dx = math.random(2,mx)
+ local dy = math.random(2,my)
+ local sizex = osizex*mx
+ local sizey = osizey*my
+ local module = nn.SpatialLPPooling(fanin,p,mx,my,dx,dy)
+ local input = torch.rand(fanin,sizey,sizex)
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error on state ')
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+end
+
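+-- Formula note (illustrative): LP pooling computes (sum_i x_i^p)^(1/p) over
+-- each window; with p = 2 as tested above this is the windowed L2 norm, e.g.
+-- a 2x2 window holding {1, 2, 2, 4} pools to sqrt(1 + 4 + 4 + 16) = 5:
+do
+   assert(math.sqrt(1^2 + 2^2 + 2^2 + 4^2) == 5)
+end
+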
+function nntest.Sum()
+ -- 1D
+ local ini = math.random(3,7)
+ local input = torch.Tensor(ini):zero()
+ local module = nn.Sum(1)
+
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ -- negative dimension
+ local module = nn.Sum(-1)
+ local input = torch.Tensor({1, 2, 3})
+ local expected = torch.Tensor({6})
+ local output = module:forward(input)
+ mytester:assertlt(torch.norm(output-expected), precision, 'error on forward ')
+
+ -- batch
+ local dimension = 1
+ local module = nn.Sum(dimension, 1)
+ local input = torch.Tensor({{1, 2, 3},{4, 5, 6}})
+ local expected = torch.Tensor({6, 15})
+ local output = module:forward(input)
+ mytester:assertlt(torch.norm(output-expected), precision, 'error on forward ')
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ -- mean + batch
+ local dimension = 1
+ local module = nn.Sum(dimension, 1, true)
+ local input = torch.Tensor({{1, 2, 3},{4, 5, 6}})
+ local expected = input:mean(dimension + 1)
+ local output = module:forward(input)
+
+ mytester:assertlt(torch.norm(output-expected), precision, 'error on forward ')
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ -- squeeze
+ local dimension = 1
+ local module = nn.Sum(dimension, nil, nil, false)
+ local input = torch.Tensor({{1, 2, 3},{4, 5, 6}})
+ local expected = torch.Tensor({5, 7, 9}):view(1, 3)
+ local output = module:forward(input)
+
+ mytester:assertlt(torch.norm(output-expected), precision, 'error on forward ')
+ mytester:assert(output:isSameSizeAs(expected), 'sizes mismatch')
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ -- squeeze + batch
+ local dimension = 1
+ local module = nn.Sum(dimension, 1, nil, false)
+ local input = torch.Tensor({{1, 2, 3},{4, 5, 6}})
+ local expected = torch.Tensor({6, 15}):view(2, 1)
+ local output = module:forward(input)
+
+ mytester:assertlt(torch.norm(output-expected), precision, 'error on forward ')
+ mytester:assert(output:isSameSizeAs(expected), 'sizes mismatch')
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ -- 3D
+ local ini = math.random(3,5)
+ local inj = math.random(3,5)
+ local ink = math.random(3,5)
+ local input = torch.Tensor(ini,inj,ink):zero()
+ local module = nn.Sum(torch.random(1,3))
+
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ local ferr,berr = jac.testIO(module,input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+end
+
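+-- Signature note (inferred from the calls above, so treat as a sketch):
+-- nn.Sum(dimension[, nInputDims[, sizeAverage[, squeeze]]]). nInputDims lets
+-- the module distinguish batched input, sizeAverage turns the sum into a
+-- mean, and squeeze=false keeps the reduced dimension with size 1:
+do
+   local m = nn.Sum(1, nil, nil, false)
+   local y = m:forward(torch.Tensor({{1, 2}, {3, 4}}))
+   assert(y:isSameSizeAs(torch.Tensor(1, 2))) -- reduced dim kept as size 1
+end
+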
+function nntest.Tanh()
+ local ini = math.random(3,5)
+ local inj = math.random(3,5)
+ local ink = math.random(3,5)
+ local input = torch.Tensor(ink, inj, ini):zero()
+
+ local module = nn.Tanh()
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision , 'error on state ')
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+end
+
+function nntest.TemporalConvolution()
+ -- 1D
+ local from = math.random(1,5)
+ local to = math.random(1,5)
+ local ki = math.random(1,5)
+ local si = math.random(1,4)
+ local outi = math.random(5,7)
+ local ini = (outi-1)*si+ki
+ local module = nn.TemporalConvolution(from, to, ki,si)
+ local input = torch.Tensor(ini, from):zero()
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error on state ')
+
+ local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+ mytester:assertlt(err , precision, 'error on weight ')
+
+ local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+ mytester:assertlt(err , precision, 'error on bias ')
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+ mytester:assertlt(err , precision, 'error on weight [direct update]')
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+ mytester:assertlt(err , precision, 'error on bias [direct update]')
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on weight [%s]', t))
+ end
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on bias [%s]', t))
+ end
+
+ -- 2D
+ local nBatchFrame = 4
+ local input = torch.Tensor(nBatchFrame, ini, from):zero()
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error on state ')
+
+ local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+ mytester:assertlt(err , precision, 'error on weight ')
+
+ local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+ mytester:assertlt(err , precision, 'error on bias ')
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+ mytester:assertlt(err , precision, 'error on weight [direct update]')
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+ mytester:assertlt(err , precision, 'error on bias [direct update]')
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on weight [%s]', t))
+ end
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on bias [%s]', t))
+ end
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
+
+ -- 2D matches 1D
+ local output = module:forward(input):clone()
+ local outputGrad = torch.randn(output:size())
+ local inputGrad = module:backward(input, outputGrad):clone()
+
+ local input1D = input:select(1, 2)
+ local output1D = module:forward(input1D)
+ local outputGrad1D = outputGrad:select(1, 2)
+ local inputGrad1D = module:backward(input1D, outputGrad1D)
+
+   mytester:assertTensorEq(output:select(1,2), output1D, 0.000001, 'error on 2D vs 1D forward')
+   mytester:assertTensorEq(inputGrad:select(1,2), inputGrad1D, 0.000001, 'error on 2D vs 1D backward')
+end
+
+function nntest.TemporalDynamicKMaxPooling()
+ local features = math.random(5,10)
+ local seqLen = math.random(6,9)
+ local minK = math.random(3,6)
+ local factor = math.random(1,100)*0.01
+ local nBatchFrame = math.random(2,4)
+ local module = nn.TemporalDynamicKMaxPooling(minK, factor)
+
+ -- 1D
+ local input = torch.Tensor(seqLen, features)
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error on state ')
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
+ mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+
+ -- 2D
+ local input = torch.Tensor(nBatchFrame, seqLen, features)
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error on state ')
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:asserteq(0, ferr, torch.typename(module) .. ' - i/o forward err ')
+ mytester:asserteq(0, berr, torch.typename(module) .. ' - i/o backward err ')
+
+ -- 2D matches 1D
+ local output = module:forward(input):clone()
+ local outputGrad = torch.randn(output:size())
+ local inputGrad = module:backward(input, outputGrad):clone()
+
+ local input1D = input:select(1, 2)
+ local output1D = module:forward(input1D)
+ local outputGrad1D = outputGrad:select(1, 2)
+ local inputGrad1D = module:backward(input1D, outputGrad1D)
+
+   mytester:assertTensorEq(output:select(1,2), output1D, 0.000001, 'error on 2D vs 1D forward')
+   mytester:assertTensorEq(inputGrad:select(1,2), inputGrad1D, 0.000001, 'error on 2D vs 1D backward')
+end
+
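+-- Note (illustrative, assuming the usual dynamic k-max formulation): the
+-- module keeps, per feature, the k largest values along the sequence in their
+-- original order, with k derived from the sequence length roughly as
+-- k = max(minK, ceil(factor * seqLen)), so longer sequences keep more frames
+-- but never fewer than minK:
+do
+   local minK, factor, seqLen = 4, 0.5, 9
+   assert(math.max(minK, math.ceil(factor * seqLen)) == 5)
+end
+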
+function nntest.TemporalSubSampling()
+ local from = math.random(1,5)
+ local ki = math.random(1,6)
+ local si = math.random(1,4)
+ local outi = math.random(6,9)
+ local ini = (outi-1)*si+ki
+ local module = nn.TemporalSubSampling(from, ki, si)
+ local input = torch.Tensor(ini, from):zero()
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error on state ')
+
+ local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+ mytester:assertlt(err , precision, 'error on weight ')
+
+ local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+ mytester:assertlt(err , precision, 'error on bias ')
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+ mytester:assertlt(err , precision, 'error on weight [direct update] ')
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+ mytester:assertlt(err , precision, 'error on bias [direct update] ')
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on weight [%s]', t))
+ end
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on bias [%s]', t))
+ end
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
+end
+
+function nntest.TemporalRowConvolution()
+ if true then return end -- until this unit test is fixed...
+ local from = math.random(1,5)
+ local ki = math.random(1,5)
+ local si = math.random(1,2)
+ local outi = math.random(5,7)
+ local ini = (outi-1)*si+ki
+
+ local function jacTest(module)
+
+ local input
+ if module.featFirst then
+ input = torch.Tensor(from, ini):zero()
+ else
+ input = torch.Tensor(ini, from):zero()
+ end
+
+ -- 1D
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, "error on state" )
+
+ local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+ mytester:assertlt(err, precision, "error on weight ")
+
+ if module.bias then
+ local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+ mytester:assertlt(err, precision, "error on bias ")
+ end
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+ mytester:assertlt(err, precision, "error on weight [direct update] ")
+
+ if module.bias then
+ local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+ mytester:assertlt(err, precision, "error on bias [direct update] ")
+ end
+
+ for t, err in pairs(jac.testAllUpdate(module, input, "weight", "gradWeight")) do
+ mytester:assertlt(err, precision, string.format(
+ "error on weight [%s] ", t))
+ end
+
+ if module.bias then
+ for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+ mytester:assertlt(err, precision, string.format(
+ "error on bias [%s] ", t))
+ end
+ end
+
+ -- 2D
+ local nBatchFrame = 4
+ if module.featFirst then
+ input = torch.Tensor(nBatchFrame, from, ini):zero()
+ else
+ input = torch.Tensor(nBatchFrame, ini, from):zero()
+ end
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, "error on state" )
+
+ local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+ mytester:assertlt(err, precision, "error on weight ")
+
+ if module.bias then
+ local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+ mytester:assertlt(err, precision, "error on bias ")
+ end
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+ mytester:assertlt(err, precision, "error on weight [direct update] ")
+
+ if module.bias then
+ local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+ mytester:assertlt(err, precision, "error on bias [direct update] ")
+ end
+
+ for t, err in pairs(jac.testAllUpdate(module, input, "weight", "gradWeight")) do
+ mytester:assertlt(err, precision, string.format(
+ "error on weight [%s] ", t))
+ end
+
+ if module.bias then
+ for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+ mytester:assertlt(err, precision, string.format(
+ "error on bias [%s] ", t))
+ end
+ end
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:eq(0, ferr, torch.typename(module) .. " - i/o forward err ", precision)
+      mytester:eq(0, berr, torch.typename(module) .. " - i/o backward err ", precision)
+
+ -- 2D matches 1D
+ local output = module:forward(input):clone()
+ local outputGrad = torch.randn(output:size())
+ local inputGrad = module:backward(input, outputGrad):clone()
+
+ local input1D = input:select(1, 2)
+ local output1D = module:forward(input1D)
+ local outputGrad1D = outputGrad:select(1, 2)
+ local inputGrad1D = module:backward(input1D, outputGrad1D)
+
+ mytester:assertTensorEq(output:select(1,2), output1D, 0.000001,
+ "error on 2D vs 1D forward")
+ mytester:assertTensorEq(inputGrad:select(1,2), inputGrad1D, 0.000001,
+ "error on 2D vs 1D backward")
+ end
+
+ local module = nn.TemporalRowConvolution(from, ki, si)
+ jacTest(module)
+ module:noBias()
+ jacTest(module)
+ module.bias = torch.Tensor(module.inputFrameSize):zero()
+ module.gradBias = torch.Tensor(module.inputFrameSize):zero()
+ module:reset()
+ module.featFirst = true
+ jacTest(module)
+ module:noBias()
+   jacTest(module)
+end
+
+function nntest.TemporalMaxPooling()
+ local from = math.random(2,4)
+ local ki = math.random(5,7)
+ local si = math.random(1,2)
+ local outi = math.random(30,40)
+ local ini = (outi-1)*si+ki
+ local module = nn.TemporalMaxPooling(ki, si)
+ local input = torch.Tensor(ini, from):zero()
+
+ -- 1D
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error on state ')
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
+
+ -- 2D
+ local nBatchFrame = 2
+ local input = torch.Tensor(nBatchFrame, ini, from):zero()
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error on state ')
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
+
+ -- 2D matches 1D
+ local output = module:forward(input):clone()
+ local outputGrad = torch.randn(output:size())
+ local inputGrad = module:backward(input, outputGrad):clone()
+
+ local input1D = input:select(1, 2)
+ local output1D = module:forward(input1D)
+ local outputGrad1D = outputGrad:select(1, 2)
+ local inputGrad1D = module:backward(input1D, outputGrad1D)
+
+   mytester:assertTensorEq(output:select(1,2), output1D, 0.000001, 'error on 2D vs 1D forward')
+   mytester:assertTensorEq(inputGrad:select(1,2), inputGrad1D, 0.000001, 'error on 2D vs 1D backward')
+end
+
+function nntest.VolumetricFullConvolution_simple_test()
+ local module = nn.VolumetricFullConvolution(3, 1, 3, 3, 3, 3, 3, 3);
+ module.weight:fill(1);
+ module.bias:fill(0.1);
+
+ local input = torch.Tensor(1, 3, 2, 2, 2):zero();
+ for c = 1,3 do
+ input[1][c][1][1][1] = 1
+ end
+ local output = module:forward(input)
+ for t = 1,6 do
+ for h = 1,6 do
+ for w = 1,6 do
+ if t <= 3 and h <= 3 and w <= 3 then
+ mytester:assertlt(output[1][1][t][h][w] - 3.1, precision, 'error on forward ')
+ else
+ mytester:assertlt(output[1][1][t][h][w] - 0.1, precision, 'error on forward ')
+ end
+ end
+ end
+ end
+
+ module:zeroGradParameters()
+ local gradOut = torch.Tensor(1, 1, 6, 6, 6):fill(0.1);
+ local gradIn = module:backward(input, gradOut)
+ for t = 1,2 do
+ for h = 1,2 do
+ for w = 1,2 do
+ mytester:assertlt(gradIn[1][1][t][h][w] - 2.7, precision,
+ 'error on backward input gradients ')
+ end
+ end
+ end
+
+ mytester:assertlt(module.gradBias[1] - 21.6, precision,
+ 'error on backward gradBias ')
+ for c = 1,3 do
+ for t = 1,3 do
+ for h = 1,3 do
+ for w = 1,3 do
+ mytester:assertlt(module.gradWeight[c][1][t][h][w] - 0.1, precision,
+ 'error on backward weight gradients ')
+ end
+ end
+ end
+ end
+end
+
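+-- Size note (illustrative): full (transposed) convolution inverts the usual
+-- convolution size formula: out = (in-1)*stride - 2*pad + k + adj. The simple
+-- test above uses in=2, k=3, stride=3, pad=0, adj=0, hence its 6x6x6 output:
+do
+   local inp, k, s, pad, adj = 2, 3, 3, 0, 0
+   assert((inp - 1)*s - 2*pad + k + adj == 6)
+end
+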
+function nntest.VolumetricFullConvolution()
+ local from = math.random(2,3)
+ local to = math.random(2,3)
+ local kt = math.random(3,4)
+ local ki = math.random(3,4)
+ local kj = ki
+ local st = math.random(1,3)
+ local si = math.random(1,3)
+ local sj = si
+ local int = math.random(3,4)
+ local ini = math.random(3,4)
+ local inj = math.random(3,4)
+ local bs = math.random(1, 6)
+ local module = nn.VolumetricFullConvolution(from, to, kt, ki, kj, st, si, sj)
+
+ local input = torch.Tensor(bs, from, int, ini, inj):zero()
+
+ local function jacTests(module)
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error on state ')
+
+ local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+ mytester:assertlt(err , precision, 'error on weight ')
+
+ if module.bias then
+ local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+ mytester:assertlt(err , precision, 'error on bias ')
+ end
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
+ end
+
+ jacTests(module)
+ module:noBias()
+ jacTests(module)
+ module.bias = torch.Tensor(module.nOutputPlane):zero()
+ module.gradBias = torch.Tensor(module.nOutputPlane):zero()
+ module:reset()
+ jacTests(module)
+end
+
+function nntest.VolumetricFullConvolutionDualInput()
+ local from = math.random(2,3)
+ local to = math.random(2,3)
+ local kt = math.random(3,4)
+ local ki = math.random(3,4)
+ local kj = math.random(3,4)
+ local dt = math.random(1,3)
+ local di = math.random(1,3)
+ local dj = math.random(1,3)
+ local padT = math.random(0,2)
+ local padW = math.random(0,2)
+ local padH = math.random(0,2)
+ local outt = math.random(5,9)
+ local outi = math.random(5,9)
+ local outj = math.random(5,9)
+ local int = math.floor((outt + padT*2 - kt)/dt + 1)
+ local ini = math.floor((outi + padW*2 - ki)/di + 1)
+ local inj = math.floor((outj + padH*2 - kj)/dj + 1)
+ local adjT = (outt + 2 * padT - kt) % dt
+ local adjW = (outi + 2 * padW - ki) % di
+ local adjH = (outj + 2 * padH - kj) % dj
+ local targetTensor = torch.Tensor(outt, outj, outi):zero()
+ local input = torch.Tensor(from, int, inj, ini):zero()
+
+ local module = nn.VolumetricFullConvolution(from, to, kt, ki, kj, dt, di, dj, padT, padW, padH)
+ local moduleRef = nn.VolumetricFullConvolution(from, to, kt, ki, kj, dt, di, dj, padT, padW, padH, adjT, adjW, adjH)
+ moduleRef.weight:copy(module.weight)
+ moduleRef.bias:copy(module.bias)
+
+ -- Check that the required output size matches the actual output size
+ -- when using the dual input mode
+ local output = module:forward({input, targetTensor})
+ mytester:asserteq(output:size(2), outt, 'output depth error')
+ mytester:asserteq(output:size(3), outj, 'output height error')
+ mytester:asserteq(output:size(4), outi, 'output width error')
+
+ -- Check that backward and forward match the reference module
+ local outputRef = moduleRef:forward(input)
+ mytester:asserteq(0, (output-outputRef):abs():max(), torch.typename(module) .. ' - output err ')
+ local gradOutput = outputRef:clone():uniform()
+ local gradInputRef = moduleRef:backward(input, gradOutput)
+ local gradInput = module:backward({input, targetTensor}, gradOutput)
+ mytester:asserteq(0, (gradInput[1]-gradInputRef):abs():max(), torch.typename(module) .. ' - gradInput[1] err ')
+
+ -- Check that gradInput[2] is the singleton tensor {0}
+ mytester:asserteq(gradInput[2]:storage():size(), 1, torch.typename(module) .. ' - gradInput[2] size err ')
+ mytester:asserteq(gradInput[2]:storage()[1], 0, torch.typename(module) .. ' - gradInput[2] value err ')
+end
+
+function nntest.VolumetricConvolution()
+ local from = math.random(2,4)
+ local to = math.random(1,4)
+ local kt = math.random(1,4)
+ local ki = math.random(1,4)
+ local kj = math.random(1,4)
+ local st = math.random(1,3)
+ local si = math.random(1,3)
+ local sj = math.random(1,3)
+ local padT = math.random(0,2)
+ local padW = math.random(0,2)
+ local padH = math.random(0,2)
+ local outt = math.random(5,7)
+ local outi = math.random(5,7)
+ local outj = math.random(5,7)
+ local int = (outt-1)*st+kt-padT*2
+ local ini = (outi-1)*si+ki-padW*2
+ local inj = (outj-1)*sj+kj-padH*2
+ local module = nn.VolumetricConvolution(from, to, kt, ki, kj, st, si, sj, padT, padW, padH)
+ local input = torch.Tensor(from, int, inj, ini):zero()
+
+ local function jacTests(module)
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error on state ')
+
+ local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+ mytester:assertlt(err , precision, 'error on weight ')
+
+ if module.bias then
+ local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+ mytester:assertlt(err , precision, 'error on bias ')
+ end
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+ mytester:assertlt(err , precision, 'error on weight [direct update] ')
+
+ if module.bias then
+ local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+ mytester:assertlt(err , precision, 'error on bias [direct update] ')
+ end
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on weight [%s]', t))
+ end
+
+ if module.bias then
+ for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on bias [%s]', t))
+ end
+ end
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
+ end
+
+ jacTests(module)
+ module:noBias()
+ jacTests(module)
+ module.bias = torch.Tensor(module.nOutputPlane):zero()
+ module.gradBias = torch.Tensor(module.nOutputPlane):zero()
+ module:reset()
+ jacTests(module)
+end
+
+function nntest.VolumetricDilatedConvolution()
+ local from = math.random(1,5)
+ local to = math.random(1,5)
+ local ki = math.random(1,5)
+ local kj = math.random(1,5)
+ local kk = math.random(1,5)
+ local di = math.random(1,4)
+ local dj = math.random(1,4)
+ local dk = math.random(1,4)
+ local padW = 0 -- math.random(0,2)
+ local padH = 0 -- math.random(0,2)
+ local padT = 0 -- math.random(0,2)
+ local outi = math.random(2,3)
+ local outj = math.random(2,5)
+ local outk = math.random(2,5)
+ local dilationW = math.random(1,3)
+ local dilationH = math.random(1,3)
+ local dilationT = math.random(1,3)
+ local ini = (outi - 1) * di - 2 * padW + dilationW * (ki-1) + 1
+ local inj = (outj - 1) * dj - 2 * padH + dilationH * (kj-1) + 1
+ local ink = (outk - 1) * dk - 2 * padT + dilationT * (kk-1) + 1
+
+ local module = nn.VolumetricDilatedConvolution(from, to, kk, ki, kj, dk, di, dj, padT, padW, padH, dilationT, dilationW, dilationH)
+ local input = torch.Tensor(from, ink, inj, ini):zero()
+
+ -- stochastic
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error on state ')
+
+ local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+ mytester:assertlt(err , precision, 'error on weight ')
+
+ local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+ mytester:assertlt(err , precision, 'error on bias ')
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+ mytester:assertlt(err , precision, 'error on weight [direct update] ')
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+ mytester:assertlt(err , precision, 'error on bias [direct update] ')
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on weight [%s]', t))
+ end
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on bias [%s]', t))
+ end
+
+ -- batch
+
+ local batch = math.random(2,5)
+
+ module = nn.VolumetricDilatedConvolution(from, to, kk, ki, kj, dk, di, dj, padT, padW, padH, dilationT, dilationW, dilationH)
+ input = torch.Tensor(batch,from,ink,inj,ini):zero()
+
+ -- Check that the required output size matches the actual output size
+ local output = module:forward(input)
+   mytester:asserteq(output:size(3), outk, 'output depth error')
+ mytester:asserteq(output:size(4), outj, 'output height error')
+ mytester:asserteq(output:size(5), outi, 'output width error')
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'batch error on state ')
+
+ local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight)
+ mytester:assertlt(err , precision, 'batch error on weight ')
+
+ local err = jac.testJacobianParameters(module, input, module.bias, module.gradBias)
+ mytester:assertlt(err , precision, 'batch error on bias ')
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+ mytester:assertlt(err , precision, 'batch error on weight [direct update] ')
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+ mytester:assertlt(err , precision, 'batch error on bias [direct update] ')
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+ mytester:assertlt(err, precision, string.format(
+         'batch error on weight [%s]', t))
+ end
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+ mytester:assertlt(err, precision, string.format(
+ 'batch error on bias [%s]', t))
+ end
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
+
+ -- non-contiguous
+ local input = torch.randn(batch,from,ink,ini,inj):transpose(4,5) -- non-contiguous
+ local inputc = input:contiguous() -- contiguous
+ local output = module:forward(input)
+ local outputc = module:forward(inputc)
+   mytester:asserteq(0, (output-outputc):abs():max(), torch.typename(module) .. ' - non-contiguous err ')
+   local gradInput = module:backward(input, output)
+   local gradInputc = module:backward(inputc, outputc)
+   mytester:asserteq(0, (gradInput-gradInputc):abs():max(), torch.typename(module) .. ' - non-contiguous err ')
+end
+
+function nntest.VolumetricConvolutionBatchCompare()
+ local from = math.random(2,3)
+ local to = math.random(2,3)
+ local kt = math.random(3,4)
+ local ki = math.random(3,4)
+ local kj = math.random(3,4)
+ local st = math.random(2,3)
+ local si = math.random(2,3)
+ local sj = math.random(2,3)
+ local padT = math.random(0,2)
+ local padW = math.random(0,2)
+ local padH = math.random(0,2)
+ local outt = math.random(3,4)
+ local outi = math.random(3,4)
+ local outj = math.random(3,4)
+ local int = (outt-1)*st+kt-padT*2
+ local ini = (outi-1)*si+ki-padW*2
+ local inj = (outj-1)*sj+kj-padH*2
+ local module = nn.VolumetricConvolution(from, to, kt, ki, kj, st, si, sj, padT, padW, padH)
+ module:zeroGradParameters()
+ local input = torch.randn(from, int, inj, ini)
+ batchcompare(module,input, {'weight','bias','gradWeight','gradBias'})
+end
+
+function nntest.VolumetricAveragePooling()
+ local from = math.random(2,3)
+ local kt = math.random(3,4)
+ local ki = math.random(3,4)
+ local kj = math.random(3,4)
+ local st = math.random(2,3)
+ local si = math.random(2,3)
+ local sj = math.random(2,3)
+ local outt = math.random(3,4)
+ local outi = math.random(3,4)
+ local outj = math.random(3,4)
+ local int = (outt-1)*st+kt
+ local ini = (outi-1)*si+ki
+ local inj = (outj-1)*sj+kj
+ local module = nn.VolumetricAveragePooling(kt, ki, kj, st, si, sj)
+ local input = torch.Tensor(from, int, inj, ini):zero()
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error on state ')
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
+
+ -- batch
+ local nbatch = math.random(2,3)
+ module = nn.VolumetricAveragePooling(kt, ki, kj, st, si, sj)
+ input = torch.Tensor(nbatch, from, int, inj, ini):zero()
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error on state (Batch) ')
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err (Batch) ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err (Batch) ', precision)
+end
+
+function nntest.VolumetricMaxPooling()
+ local from = math.random(2,3)
+ local kt = math.random(3,4)
+ local ki = math.random(3,4)
+ local kj = math.random(3,4)
+ local st = math.random(2,3)
+ local si = math.random(2,3)
+ local sj = math.random(2,3)
+ local outt = math.random(3,4)
+ local outi = math.random(3,4)
+ local outj = math.random(3,4)
+ local padT = math.min(math.random(0,2),math.floor(kt/2))
+ local padW = math.min(math.random(0,2),math.floor(ki/2))
+ local padH = math.min(math.random(0,2),math.floor(kj/2))
+ local int = (outt-1)*st+kt-2*padT
+ local ini = (outi-1)*si+ki-2*padW
+ local inj = (outj-1)*sj+kj-2*padH
+ local module = nn.VolumetricMaxPooling(kt, ki, kj, st, si, sj, padT, padW, padH)
+ local input = torch.Tensor(from, int, inj, ini):zero()
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error on state ')
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:eq(0, ferr, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(0, berr, torch.typename(module) .. ' - i/o backward err ', precision)
+
+ -- batch
+ local nbatch = math.random(2,3)
+ module = nn.VolumetricMaxPooling(kt, ki, kj, st, si, sj, padT, padW, padH)
+ input = torch.Tensor(nbatch, from, int, inj, ini):zero()
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error on state (Batch) ')
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err (Batch) ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err (Batch) ', precision)
+end
+
+function nntest.VolumetricDilatedMaxPooling()
+ for _,ceil_mode in pairs({true,false}) do
+ local from = math.random(2,3)
+ local kt = math.random(3,4)
+ local ki = math.random(3,4)
+ local kj = math.random(3,4)
+ local st = math.random(2,3)
+ local si = math.random(2,3)
+ local sj = math.random(2,3)
+ local outt = math.random(3,4)
+ local outi = math.random(3,4)
+ local outj = math.random(3,4)
+ local padT = math.min(math.random(0,1),math.floor(kt/2))
+ local padW = math.min(math.random(0,1),math.floor(ki/2))
+ local padH = math.min(math.random(0,1),math.floor(kj/2))
+ local dilationT = math.random(1,3)
+ local dilationW = math.random(1,3)
+ local dilationH = math.random(1,3)
+ local int = (outt-1)*st+(dilationT*(kt-1)+1)-2*padT
+ local ini = (outi-1)*si+(dilationW*(ki-1)+1)-2*padW
+ local inj = (outj-1)*sj+(dilationH*(kj-1)+1)-2*padH
+
+ local ceil_string = ceil_mode and 'ceil' or 'floor'
+ local module = nn.VolumetricDilatedMaxPooling(kt,ki,kj,st,si,sj,padT,padW,padH,dilationT,dilationW,dilationH)
+ if ceil_mode then module:ceil() else module:floor() end
+ local input = torch.rand(from,int,inj,ini)
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error '..ceil_string..' mode on state ')
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err ')
+ mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err ')
+
+ -- batch
+ local nbatch = math.random(2,5)
+ input = torch.rand(nbatch,from,int,inj,ini)
+ module = nn.VolumetricDilatedMaxPooling(kt,ki,kj,st,si,sj,padT,padW,padH,dilationT,dilationW,dilationH)
+ if ceil_mode then module:ceil() else module:floor() end
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error '..ceil_string..' mode on state (Batch)')
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:asserteq(ferr, 0, torch.typename(module) .. ' - i/o forward err (Batch) ')
+ mytester:asserteq(berr, 0, torch.typename(module) .. ' - i/o backward err (Batch) ')
+ end
+end
+
+function nntest.VolumetricFractionalMaxPooling()
+ local batch = math.random(1, 3)
+ local plane = math.random(1, 3)
+ local outT = math.random(1, 7)
+ local outW = math.random(1, 7)
+ local outH = math.random(1, 7)
+ local poolSizeT = math.random(2, 4)
+ local poolSizeW = math.random(2, 4)
+ local poolSizeH = math.random(2, 4)
+
+ local minInT = outT + poolSizeT
+ local minInW = outW + poolSizeW
+ local minInH = outH + poolSizeH
+
+ local inT = math.random(minInT, minInT + 6)
+ local inW = math.random(minInW, minInW + 6)
+ local inH = math.random(minInH, minInH + 6)
+
+ -- fix the pooling regions so they aren't regenerated with every
+ -- forward(), so testJacobian can work properly
+ local module =
+ nn.VolumetricFractionalMaxPooling(poolSizeT, poolSizeW, poolSizeH, outT, outW, outH)
+ :fixPoolingRegions()
+ local input = nil
+ if batch == 1 then
+ input = torch.Tensor(plane, inH, inW, inT):zero()
+ else
+ input = torch.Tensor(batch, plane, inH, inW, inT):zero()
+ end
+
+ local err = nn.Jacobian.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error on state')
+end
+
+function nntest.VolumetricFractionalMaxPooling_Ratio()
+ -- Fix a reduction ratio, and test with two different input sizes
+ local reductionRatioT = torch.uniform(0.4, 0.74)
+ local reductionRatioW = torch.uniform(0.4, 0.74)
+ local reductionRatioH = torch.uniform(0.4, 0.74)
+
+ for tries = 1, 2 do
+ local batch = math.random(1, 3)
+ local plane = math.random(1, 3)
+ local poolSizeT = math.random(2, 3)
+ local poolSizeW = math.random(2, 3)
+ local poolSizeH = math.random(2, 3)
+
+ local minInT = math.random(5, 8) + poolSizeT
+ local minInW = math.random(5, 8) + poolSizeW
+ local minInH = math.random(5, 8) + poolSizeH
+
+ local inT = math.random(minInT, minInT + 6)
+ local inW = math.random(minInW, minInW + 6)
+ local inH = math.random(minInH, minInH + 6)
+
+ -- fix the pooling regions so they aren't regenerated with every
+ -- forward(), so testJacobian can work properly
+ local module =
+ nn.VolumetricFractionalMaxPooling(poolSizeT, poolSizeW, poolSizeH,
+ reductionRatioT, reductionRatioW,
+ reductionRatioH)
+ :fixPoolingRegions()
+ local input = nil
+ if batch == 1 then
+ input = torch.Tensor(plane, inH, inW, inT):zero()
+ else
+ input = torch.Tensor(batch, plane, inH, inW, inT):zero()
+ end
+
+ -- Make sure that the output size is based on our ratio
+ local output = module:updateOutput(input)
+ if batch == 1 then
+ mytester:asserteq(output:size(4), math.floor(reductionRatioT * inT))
+ mytester:asserteq(output:size(3), math.floor(reductionRatioW * inW))
+ mytester:asserteq(output:size(2), math.floor(reductionRatioH * inH))
+ else
+ mytester:asserteq(output:size(5), math.floor(reductionRatioT * inT))
+ mytester:asserteq(output:size(4), math.floor(reductionRatioW * inW))
+ mytester:asserteq(output:size(3), math.floor(reductionRatioH * inH))
+ end
+
+ local err = nn.Jacobian.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error on state')
+ end
+end
+
+function nntest.VolumetricMaxUnpooling()
+ local from = math.random(2,3)
+ local kt = math.random(3,4)
+ local ki = math.random(3,4)
+ local kj = math.random(3,4)
+ local st, si, sj = kt, ki, kj
+ local outt = math.random(3,4)
+ local outi = math.random(3,4)
+ local outj = math.random(3,4)
+ local padT = math.min(math.random(0,2),math.floor(kt/2))
+ local padW = math.min(math.random(0,2),math.floor(ki/2))
+ local padH = math.min(math.random(0,2),math.floor(kj/2))
+ local int = (outt-1)*st+kt-2*padT
+ local ini = (outi-1)*si+ki-2*padW
+ local inj = (outj-1)*sj+kj-2*padH
+
+ local poolingModule = nn.VolumetricMaxPooling(kt, ki, kj, st, si, sj, padT, padW, padH)
+ local module = nn.VolumetricMaxUnpooling(poolingModule)
+
+ local original = torch.rand(from,int,inj,ini)
+ local input = poolingModule:forward(original)
+ local output = module:forward(input)
+ mytester:assert(output:isSameSizeAs(original),'VolumetricMaxUnpooling output size err')
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error ')
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+
+ -- batch
+ local nbatch = math.random(2,3)
+ original = torch.rand(nbatch,from,int,inj,ini)
+ input = poolingModule:forward(original)
+ output = module:forward(input)
+
+ mytester:assert(output:isSameSizeAs(original),'VolumetricMaxUnpooling batch output size err')
+
+ local err = jac.testJacobian(module, input)
+ mytester:assertlt(err, precision, 'error on Batch')
+
+ local ferr, berr = jac.testIO(module, input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err (Batch) ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err (Batch) ', precision)
+end
+
+function nntest.VolumetricMaxPooling_boundary()
+ -- simple kernel 2x2x2 with striding 2x2x2
+ local module = nn.VolumetricMaxPooling(2, 2, 2, 2, 2, 2):ceil()
+ local nip = math.random(3,256)
+ local input = torch.rand(nip, 2, 7, 7)
+
+ -- do a forward pass
+ local output = module:forward(input)
+
+ -- checking output size
+ mytester:asserteq(output:size(1), nip, 'wrong output channels')
+ mytester:asserteq(output:size(2), 1, 'wrong output temporal length')
+ mytester:asserteq(output:size(3), 4, 'wrong output height')
+ mytester:asserteq(output:size(4), 4, 'wrong output width')
+
+ -- checking output signals at top right
+ for c = 1,nip do
+ local max_val = input[c][1][1][7]
+ for t = 1,2 do
+ for h = 1,2 do
+ max_val = math.max(max_val, input[c][t][h][7])
+ end
+ end
+ mytester:asserteq(output[c][1][1][4], max_val, 'wrong forward execution')
+ end
+ -- checking output signals at bottom left
+ for c = 1,nip do
+ local max_val = input[c][1][7][1]
+ for t = 1,2 do
+ for w = 1,2 do
+ max_val = math.max(max_val, input[c][t][7][w])
+ end
+ end
+ mytester:asserteq(output[c][1][4][1], max_val, 'wrong forward execution')
+ end
+
+ -- check output signals at right bottom
+ for c = 1,nip do
+ local max_val = math.max(input[c][1][7][7], input[c][2][7][7])
+ mytester:asserteq(output[c][1][4][4], max_val, 'wrong forward execution')
+ end
+
+   -- backward is already exercised by nntest.VolumetricMaxPooling;
+   -- this test only covers the boundary cases
+end
+
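+-- Boundary arithmetic (illustrative): in ceil mode the output size is
+-- ceil((in + 2*pad - k)/stride) + 1 rather than floor(...) + 1. For the 7x7
+-- planes above with k=2, stride=2, pad=0 that is ceil(2.5)+1 = 4, so the last
+-- window hangs off the edge and covers a single input row/column:
+do
+   local inp, k, s = 7, 2, 2
+   assert(math.floor((inp - k)/s) + 1 == 3) -- floor mode would give 3
+   assert(math.ceil((inp - k)/s) + 1 == 4)  -- ceil mode gives the 4 tested
+end
+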
+function nntest.Module_getParameters_1()
+ local n = nn.Sequential()
+ n:add( nn.Linear(10,10) )
+ local p = n:getParameters()
+
+ mytester:asserteq((p[{ {1,100} }] - n.modules[1].weight):norm(), 0, 'getParameters(): weights wrong')
+ mytester:asserteq((p[{ {101,110} }] - n.modules[1].bias):norm(), 0, 'getParameters(): bias wrong')
+end
+
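+-- Sketch (illustrative): getParameters() gathers the module's weight and bias
+-- into one contiguous flat vector and re-points them at views into it, so
+-- writes through the flat vector are visible in the module and vice versa:
+do
+   local lin = nn.Linear(10, 10)
+   local p = lin:getParameters() -- 100 weights followed by 10 biases
+   p:fill(0.5)
+   assert(lin.weight[1][1] == 0.5 and lin.bias[1] == 0.5)
+end
+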
+function nntest.Module_getParameters_2()
+ local n = nn.Sequential()
+ n:add( nn.Linear(10,10) )
+ local _ = n:getParameters()
+
+ n:add( nn.Linear(10,10) )
+ local p = n:getParameters()
+
+ mytester:asserteq((p[{ {111,210} }] - n.modules[2].weight):norm(), 0, 'error when appending new module')
+ mytester:asserteq((p[{ {211,220} }] - n.modules[2].bias):norm(), 0, 'error when appending new module')
+end
+
+function nntest.Module_getParameters_3()
+ local n = nn.Sequential()
+ n:add( nn.Linear(10,10) )
+ n:add( n.modules[1]:clone() )
+ local p = n:getParameters()
+
+ mytester:asserteq((p[{ {1,100} }] - n.modules[1].weight):norm(), 0, 'error when using cloning')
+ mytester:asserteq((p[{ {101,110} }] - n.modules[1].bias):norm(), 0, 'error when using cloning')
+
+ mytester:asserteq((p[{ {111,210} }] - n.modules[2].weight):norm(), 0, 'error when using cloning')
+ mytester:asserteq((p[{ {211,220} }] - n.modules[2].bias):norm(), 0, 'error when using cloning')
+
+ mytester:asserteq((p[{ {111,210} }] - n.modules[1].weight):norm(), 0, 'error when using cloning')
+ mytester:asserteq((p[{ {211,220} }] - n.modules[1].bias):norm(), 0, 'error when using cloning')
+
+ n:reset()
+
+ mytester:assertgt((p[{ {111,210} }] - n.modules[1].weight):norm(), 0, 'error when using cloning')
+ mytester:assertgt((p[{ {211,220} }] - n.modules[1].bias):norm(), 0, 'error when using cloning')
+end
+
+function nntest.Module_getParameters_4()
+ local n = nn.Sequential()
+ n:add( nn.Linear(10,10) )
+ n:add( n.modules[1]:clone() )
+ local _ = n:getParameters()
+
+ n:add(nn.Linear(10,10))
+ local p = n:getParameters()
+
+ mytester:asserteq((p[{ {1,100} }] - n.modules[1].weight):norm(), 0, 'error when using cloning')
+ mytester:asserteq((p[{ {101,110} }] - n.modules[1].bias):norm(), 0, 'error when using cloning')
+
+ mytester:asserteq((p[{ {111,210} }] - n.modules[2].weight):norm(), 0, 'error when using cloning')
+ mytester:asserteq((p[{ {211,220} }] - n.modules[2].bias):norm(), 0, 'error when using cloning')
+
+ mytester:asserteq((p[{ {221,320} }] - n.modules[3].weight):norm(), 0, 'error when using cloning')
+ mytester:asserteq((p[{ {321,330} }] - n.modules[3].bias):norm(), 0, 'error when using cloning')
+
+ mytester:asserteq(p:nElement(), 3*(10*10+10), 'error: incorrect number of elements in flat vector')
+end
+
+function nntest.Module_getParameters_5()
+ local n = nn.Sequential()
+ n:add( nn.Linear(10,10) )
+ n:add( n.modules[1]:clone('weight','bias','gradWeight','gradBias') )
+ local p = n:getParameters()
+
+ mytester:asserteq((p[{ {1,100} }] - n.modules[1].weight):norm(), 0, 'error when using cloning+sharing')
+ mytester:asserteq((p[{ {101,110} }] - n.modules[1].bias):norm(), 0, 'error when using cloning+sharing')
+
+ mytester:asserteq((p[{ {1,100} }] - n.modules[2].weight):norm(), 0, 'error when using cloning+sharing')
+ mytester:asserteq((p[{ {101,110} }] - n.modules[2].bias):norm(), 0, 'error when using cloning+sharing')
+
+ n:reset()
+
+ mytester:asserteq((p[{ {1,100} }] - n.modules[2].weight):norm(), 0, 'error when using cloning+sharing')
+ mytester:asserteq((p[{ {101,110} }] - n.modules[2].bias):norm(), 0, 'error when using cloning+sharing')
+
+ mytester:asserteq(p:nElement(), (10*10+10), 'error: incorrect number of elements in flat vector')
+end
+
+function nntest.Module_getParameters_6()
+ local n = nn.Sequential()
+ n:add( nn.Linear(10,10) )
+ n:add( n.modules[1]:clone('weight','bias','gradWeight','gradBias') )
+ local _ = n:getParameters()
+
+ n:add(nn.Linear(10,10))
+ local p = n:getParameters()
+
+ mytester:asserteq((p[{ {1,100} }] - n.modules[1].weight):norm(), 0, 'error when using cloning+sharing')
+ mytester:asserteq((p[{ {101,110} }] - n.modules[1].bias):norm(), 0, 'error when using cloning+sharing')
+
+ mytester:asserteq((p[{ {1,100} }] - n.modules[2].weight):norm(), 0, 'error when using cloning+sharing')
+ mytester:asserteq((p[{ {101,110} }] - n.modules[2].bias):norm(), 0, 'error when using cloning+sharing')
+
+ mytester:asserteq((p[{ {111,210} }] - n.modules[3].weight):norm(), 0, 'error when using cloning+sharing')
+ mytester:asserteq((p[{ {211,220} }] - n.modules[3].bias):norm(), 0, 'error when using cloning+sharing')
+
+ mytester:asserteq(p:nElement(), 2*(10*10+10), 'error: incorrect number of elements in flat vector')
+end
+
+function nntest.Module_getParameters_7()
+ local n = nn.Sequential()
+ n:add( nn.Linear(10,10) )
+ n:add( n.modules[1]:clone('weight','bias','gradWeight','gradBias') )
+ local _ = n:getParameters()
+
+ n:add(nn.Linear(10,10))
+ local _ = n:getParameters()
+
+ local n1 = nn.Sequential()
+ n1:add( nn.Linear(10,10) )
+
+ local n2 = nn.Sequential()
+ n2:add( nn.Linear(10,10) )
+
+ local n = nn.Sequential()
+ n:add( n1 )
+ n:add( n2 )
+
+ local _ = n:getParameters()
+
+ local nf = nn.Sequential()
+ nf:add( n1 )
+ nf:add( nn.Linear(10,1) )
+
+ local p = nf:getParameters()
+
+ mytester:asserteq((p[{ {1,100} }] - n1.modules[1].weight):norm(), 0, 'error when using cloning+partial realloc')
+ mytester:asserteq((p[{ {101,110} }] - n1.modules[1].bias):norm(), 0, 'error when using cloning+partial realloc')
+
+ mytester:asserteq((p[{ {111,120} }] - nf.modules[2].weight):norm(), 0, 'error when using cloning+partial realloc')
+ mytester:asserteq((p[{ {121,121} }] - nf.modules[2].bias):norm(), 0, 'error when using cloning+partial realloc')
+
+ mytester:asserteq(p:nElement(), 121, 'error: incorrect number of elements in flat vector')
+end
+
+function nntest.Module_getParameters_8()
+ local function makeMLP(nin, ns)
+ local net = nn.Sequential()
+
+ for k,v in ipairs(ns) do
+ net:add(nn.Linear(nin, v))
+ nin = v
+ end
+ local _,_ = net:getParameters()
+ return net
+ end
+
+ local mlp1 = makeMLP(10, {10,10})
+ local mlp2 = makeMLP(10, {10,10})
+
+ local net = nn.Sequential():add(mlp1:get(1))
+ :add(mlp2:get(1))
+
+ -- clone the second MLP to ensure that the weights before calling getParameters are preserved
+ mlp2 = mlp2:clone()
+
+ local p, _ = net:getParameters()
+
+ mytester:asserteq((p[{ {1,100} }] - net.modules[1].weight):norm(), 0, 'error when using partial realloc')
+ mytester:asserteq((p[{ {111,210} }] - net.modules[2].weight):norm(), 0, 'error when using partial realloc')
+ -- check that the weights have the same values as before getParameters was called
+ mytester:asserteq((net.modules[1].weight - mlp1.modules[1].weight):norm(), 0, 'error when using partial realloc')
+ mytester:asserteq((net.modules[2].weight - mlp2.modules[1].weight):norm(), 0, 'error when using partial realloc')
+
+end
+
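+-- The getParameters tests above all exercise the same contract: calling
+-- nn.Module:getParameters() gathers every parameter tensor into one flat
+-- storage and re-points each module's weight and bias at a slice of it, so
+-- writes to the flat vector are visible through the modules and vice versa.
+-- A minimal illustrative sketch of that contract (not part of the suite):
+--
+--   local lin = nn.Linear(10, 10)
+--   local flat = lin:getParameters() -- 110 elements: 100 weights + 10 biases
+--   flat:zero()                      -- writes through to lin.weight/lin.bias
+--   assert(lin.weight:sum() == 0 and lin.bias:sum() == 0)
+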
+function nntest.Module_getParameters_10()
+ -- tensors are non-contiguous but compact; they can be gathered
+ local L = nn.Linear(10,10)
+ L.weight = torch.Tensor(10,10):t():fill(1)
+ local tmp = torch.Tensor(10,10):fill(2)
+ L.bias = tmp:select(1,2)
+ local P = L:getParameters()
+ mytester:asserteq(L.weight:mean(), 1)
+ mytester:asserteq(L.bias:mean(), 2)
+ mytester:asserteq(L.weight:storage(), L.bias:storage())
+ mytester:asserteq(P:nElement(), 110)
+ mytester:asserteq(P:storage():size(), 110)
+ mytester:assertlt(L.bias[{ {10} }]:storageOffset() - 1, L.bias:storage():size())
+end
+
+function nntest.Module_getParameters_11()
+ -- tensors are non-compact; they can't be gathered
+ local L = nn.Linear(10,10)
+ local tmp = torch.Tensor(10,10):fill(2)
+ L.bias = tmp:select(2,2)
+ local ok, err = pcall(L.getParameters, L)
+ mytester:assert(not ok)
+end
+
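+-- "Compact" in the two tests above means the tensor's elements occupy a
+-- contiguous block of its storage even if the tensor itself is not
+-- contiguous: the transposed weight and the row select(1,2) in test 10
+-- cover exactly nElement() storage slots and can be flattened, whereas the
+-- column select(2,2) in test 11 leaves gaps between its elements, so
+-- getParameters() cannot gather it and is expected to raise an error.
+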
+function nntest.Module_getParameters_12()
+ -- tensors are expanded (i.e. have stride 0 along the expanded dimension)
+ local L = nn.Linear(10,10)
+ L.weight = torch.expand(torch.Tensor(10, 1):fill(1), 10, 10)
+ L.gradWeight = torch.expand(torch.Tensor(10, 1):fill(1), 10, 10)
+ L.bias = torch.Tensor(10):fill(2)
+ local P = L:getParameters()
+ mytester:asserteq(L.weight:mean(), 1)
+ mytester:asserteq(L.bias:mean(), 2)
+ mytester:asserteq(L.weight:storage(), L.bias:storage())
+ mytester:asserteq(P:nElement(), 20)
+ mytester:asserteq(P:storage():size(), 20)
+ mytester:assertlt(L.bias[{ {10} }]:storageOffset() - 1, L.bias:storage():size())
+end
+
+function nntest.Module_listModules()
+ local batchSize = 4
+ local inputSize, outputSize = 7, 6
+ local linear = nn.Linear(inputSize, outputSize)
+ local tanh = nn.Tanh()
+ local reshape = nn.Reshape(outputSize/2, 2)
+ local mlp3 = nn.Sequential()
+ mlp3:add(linear)
+ mlp3:add(tanh)
+ mlp3:add(reshape)
+
+ local mlp2 = nn.Sequential()
+ local view = nn.View(outputSize)
+ local linear2 = nn.Linear(outputSize, inputSize)
+ local tanh2 = nn.Tanh()
+ mlp2:add(mlp3)
+ mlp2:add(view)
+ mlp2:add(linear2)
+ mlp2:add(tanh2)
+
+ local concat = nn.ConcatTable()
+ local id = nn.Identity()
+ concat:add(mlp2)
+ concat:add(id)
+ local mlp = nn.Sequential()
+ local add = nn.CAddTable()
+ mlp:add(concat)
+ mlp:add(add)
+
+ local modules2 = {mlp, concat, mlp2, mlp3, linear, tanh, reshape, view, linear2, tanh2, id, add}
+ local modules = mlp:listModules()
+
+ mytester:assert(#modules2 == #modules, 'missing modules error')
+
+ for i,module in ipairs(modules) do
+ mytester:assert(torch.type(module) == torch.type(modules2[i]), 'module error')
+ end
+end
+
+function nntest.PairwiseDistance()
+ -- Note: testJacobian doesn't support table inputs, and rather than re-write
+ -- it so that it does, I'll just use a split table module on the input.
+ -- I assume both SplitTable and Sequential do not have bugs, otherwise this
+ -- test will break.
+ for p = 1,4 do -- test a few Lp norms
+ -- TEST CASE 1: non-batch input, same code path but includes a resize
+ local ini = math.random(3,5)
+ local input = torch.Tensor(2, ini):zero()
+ local module = nn.Sequential()
+ module:add(nn.SplitTable(1))
+ module:add(nn.PairwiseDistance(p))
+
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err, 1e-4, ' error on state ')
+
+ local ferr,berr = jac.testIO(module,input)
+ mytester:asserteq(ferr, 0, torch.typename(module)..' - i/o forward err ')
+ mytester:asserteq(berr, 0, torch.typename(module)..' - i/o backward err ')
+
+ -- Also check that the forward prop result is correct.
+ input = torch.rand(2, ini)
+ err = torch.dist(input:select(1,1), input:select(1,2), p) -
+ module:forward(input)[1]
+ mytester:assertlt(err,precision, ' error on non-batch fprop ')
+
+ -- TEST CASE 2: batch input
+ local inj = math.random(3,5)
+ input = torch.Tensor(2, inj, ini):zero()
+
+ -- (Rebuild the module to avoid correlated tests)
+ module = nn.Sequential()
+ module:add(nn.SplitTable(1))
+ module:add(nn.PairwiseDistance(p))
+
+ err = jac.testJacobian(module,input)
+ mytester:assertlt(err, 1e-4, ' error on state ')
+
+ -- Also check that the forward prop result is correct.
+ -- manually calculate each distance separately
+ local inputa = torch.rand(inj,ini)
+ local inputb = torch.rand(inj,ini)
+ local dist_manual = torch.Tensor(inj)
+ for i=1, inputa:size(1) do
+ dist_manual[i] = torch.dist(inputa:select(1,i), inputb:select(1,i),p)
+ end
+ -- compare the distances to the module's fprop
+ local dist = module:forward(torch.cat(inputa,inputb,1):resize(2,inj,ini))
+ err = dist - dist_manual
+ mytester:assertlt(err:norm(), precision, torch.typename(module) ..
+ ' error on batch fprop ')
+ end
+end
+
+function nntest.Index()
+ local net = nn.Index(1)
+
+ -- test 1D
+ local input = {torch.Tensor{10, 20, 30}, torch.LongTensor{1, 2, 2, 3}}
+ local output = net:forward(input)
+ equal(output, torch.Tensor{10, 20, 20, 30}, "error in 1D forward pass")
+
+ local gradOutput = torch.Tensor{1, 1, 1, 3 }
+ local gradInput = net:backward(input, gradOutput)
+ equal(gradInput[1], torch.Tensor{1, 2, 3}, "error in 1D backward pass")
+
+ -- test 2D
+ local input = {torch.Tensor{{10, 20}, {30, 40}}, torch.LongTensor{1, 1}}
+ local output = net:forward(input)
+ equal(output, torch.Tensor{{10, 20}, {10, 20}}, "error in 2D forward pass")
+
+ local gradOutput = torch.Tensor{{1, 2}, {1, 2}}
+ local gradInput = net:backward(input, gradOutput)
+ equal(gradInput[1], torch.Tensor{{2, 4}, {0, 0}}, "error in 2D backward pass")
+
+ -- test clearState
+ local m = nn.Index(1)
+ local tensor = torch.Tensor(10, 3)
+ local indices = torch.LongTensor{ 2,3,4}
+
+ m:clearState()
+ m:forward({tensor, indices})
+ m:backward({tensor,indices}, torch.rand(3,3))
+
+end
+
+function nntest.Squeeze()
+ local input = torch.Tensor(2,1,3):zero()
+ local module = nn.Squeeze()
+ equal(module:forward(input), input:squeeze(), "error in forward pass")
+ local output = input:squeeze()
+ equal(module:backward(input, output), input, "error in backward pass")
+
+ -- testing the dimension option:
+ local input = torch.Tensor(2,1,1,3):zero()
+ local module = nn.Squeeze(2)
+ equal(module:forward(input), input:squeeze(2), "error in forward pass with dimension")
+ local output = input:squeeze(2)
+ equal(module:backward(input, output), input, "error in backward pass with dimension")
+
+ -- with batch
+ local input = torch.Tensor(2,1,1,3):zero()
+ local module = nn.Squeeze(2, 3)
+ equal(module:forward(input), input:squeeze(3), "error in forward pass with dimension")
+ local output = input:squeeze(3)
+ equal(module:backward(input, output), input, "error in backward pass with dimension")
+
+ -- ... of size one
+ local input = torch.Tensor(1,1,1,3):zero()
+ local module = nn.Squeeze(2, 3)
+ equal(module:forward(input), input:squeeze(3), "error in forward pass with dimension")
+ local output = input:squeeze(3)
+ equal(module:backward(input, output), input, "error in backward pass with dimension")
+end
+
+function nntest.Unsqueeze()
+ local function assertInputOutputSize(inputSize, outputSize, tf)
+ local input = torch.Tensor(table.unpack(inputSize)):zero()
+ local output = torch.Tensor(table.unpack(outputSize)):zero()
+ local gradInput = input:clone()
+ local gradOutput = output:clone()
+ equal(tf:forward(input), output, "error in forward pass")
+ equal(tf:backward(input, gradOutput), gradInput, "error in backward pass")
+ end
+
+ local function test_normal()
+ -- insert dim 1 at head
+ local inputSize, outputSize = {2,3,4}, {1, 2,3,4}
+ local pos = 1
+ assertInputOutputSize(inputSize,outputSize, nn.Unsqueeze(pos))
+
+ -- insert dim 1 at tail
+ local inputSize, outputSize = {2,3,4}, {2,3,4, 1}
+ local pos = 4
+ assertInputOutputSize(inputSize,outputSize, nn.Unsqueeze(pos))
+
+ -- insert dim 1 in between
+ local inputSize, outputSize = {2,3,4}, {2, 1, 3,4}
+ local pos = 2
+ assertInputOutputSize(inputSize,outputSize, nn.Unsqueeze(pos))
+ end
+
+ local function test_batchmode()
+ -- batch mode: insert dim 1 at head
+ local inputSize, outputSize = {5, 2, 3, 4}, {5, 1, 2, 3, 4}
+ local pos = 1
+ local numInputDims = 3
+ assertInputOutputSize(inputSize,outputSize, nn.Unsqueeze(pos, numInputDims))
+
+ -- batch mode: insert dim 1 at tail
+ local inputSize, outputSize = {5, 2, 3, 4}, {5, 2, 3, 4, 1}
+ local pos = 4
+ local numInputDims = 3
+ assertInputOutputSize(inputSize,outputSize, nn.Unsqueeze(pos, numInputDims))
+
+ -- batch mode: insert dim 1 in between
+ local inputSize, outputSize = {5, 2, 3, 4}, {5, 2, 1, 3, 4}
+ local pos = 2
+ local numInputDims = 3
+ assertInputOutputSize(inputSize,outputSize, nn.Unsqueeze(pos, numInputDims))
+ end
+
+ local function test_sizeone()
+ local inputSize, outputSize = {1,1,3,1}, {1,1, 1, 3,1}
+ local pos = 3
+ assertInputOutputSize(inputSize,outputSize, nn.Unsqueeze(pos))
+
+ local inputSize, outputSize = {1,1,3,2}, {1,1,3,2, 1}
+ local pos = 3
+ local numInputDims = 2
+ assertInputOutputSize(inputSize,outputSize, nn.Unsqueeze(pos, numInputDims))
+ end
+
+ local function test_sizestrange()
+ local inputSize, outputSize = {2}, {2,1}
+ local pos = 2
+ assertInputOutputSize(inputSize,outputSize, nn.Unsqueeze(pos))
+
+ local inputSize, outputSize = {1}, {1, 1}
+ local pos = 1
+ assertInputOutputSize(inputSize,outputSize, nn.Unsqueeze(pos))
+ end
+
+ test_normal()
+ test_batchmode()
+ test_sizeone()
+ test_sizestrange()
+end
+
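+-- A note on the batch-mode cases above: when numInputDims is given,
+-- Unsqueeze interprets pos relative to the trailing numInputDims
+-- dimensions, so the actual insert position is
+-- pos + (input:dim() - numInputDims); e.g. input {5,2,3,4} with
+-- numInputDims = 3 and pos = 1 inserts at dimension 2, giving {5,1,2,3,4}.
+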
+function nntest.LookupTable()
+ local totalIndex = math.random(6,9)
+ local nIndex = math.random(3,5)
+ local entry_size = math.random(2,5)
+
+ local function dotest(module, input, minval, maxval)
+ local output = module:forward(input)
+ module:backwardUpdate(input, output, 0.1)
+ input:zero()
+
+ -- 1D
+ local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight, minval, maxval)
+ mytester:assertlt(err,precision, '1D error on weight ')
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.weight, minval, maxval)
+ mytester:assertlt(err,precision, '1D error on weight [direct update] ')
+
+ module.gradWeight:zero()
+ for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+ mytester:assertlt(err, precision, string.format(
+ '1D error on weight [%s]', t))
+ end
+
+ -- 2D
+ local nframe = math.random(2,5)
+ local input = torch.IntTensor(nframe, nIndex):zero()
+
+ local err = jac.testJacobianParameters(module, input, module.weight, module.gradWeight, minval, maxval)
+ mytester:assertlt(err,precision, '2D error on weight ')
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.weight, minval, maxval)
+ mytester:assertlt(err,precision, '2D error on weight [direct update] ')
+
+ module.gradWeight:zero()
+ for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+ mytester:assertlt(err, precision, string.format(
+ '2D error on weight [%s]', t))
+ end
+
+ -- IO
+ module.gradInput = torch.Tensor(3,4):zero() -- fixes an error
+ local ferr,berr = jac.testIO(module,input,minval,maxval)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+
+ -- accUpdate
+ module:accUpdateOnly()
+ mytester:assert(not module.gradWeight, 'gradWeight is nil')
+ module:float()
+ local output = module:forward(input)
+ module:backwardUpdate(input, output, 0.1)
+ end
+ -- test without padding
+ local input = torch.randperm(totalIndex):narrow(1,1,nIndex):int()
+ local module = nn.LookupTable(totalIndex, entry_size)
+ dotest(module, input, 1, totalIndex)
+ -- test with padding set to 1, but no padding in inputs
+ local input = torch.randperm(totalIndex):narrow(1,1,nIndex):int()
+ local module = nn.LookupTable(totalIndex, entry_size, 1)
+ dotest(module, input, 2, totalIndex)
+ -- test whether padding weights remain unchanged
+ local paddingValue = math.random(totalIndex)
+ local module = nn.LookupTable(totalIndex, entry_size, paddingValue)
+ local padw = module.weight:select(1,paddingValue):fill(1)
+ local padw_sum = padw:sum()
+ local input = torch.IntTensor(nIndex)
+ for i = 1, 100 do
+ input:apply(
+ function() -- set randomly half of the input as padding
+ if torch.random(2) == 1 then return paddingValue end
+ return torch.random(totalIndex)
+ end)
+ local y = module:updateOutput(input)
+ module:updateGradInput(input, y)
+ module:accUpdateGradParameters(input, y, 0.1)
+ end
+ local err = padw_sum - padw:sum()
+ mytester:assertlt(err,precision, 'padding update error ')
+ -- test whether the weights change accordingly when maxNorm is not nil
+ local all_index = torch.randperm(totalIndex):int()
+ -- input can have duplicates
+ local input = torch.repeatTensor(all_index:narrow(1,1,nIndex), 2)
+ local maxNorm = math.random()
+ for _, normType in ipairs{1, 2, math.random()} do
+ local module = nn.LookupTable(totalIndex, entry_size, 0, maxNorm, normType)
+ local oriW = module.weight:clone()
+ local output = module:updateOutput(input)
+ -- check output is of small norm
+ for j = 1,output:size(1) do
+ local norm = torch.norm(output:select(1, j), normType)
+ if norm > maxNorm then
+ local err = norm - maxNorm
+ mytester:assertlt(math.abs(err), precision, string.format(
+ 'output after renorm exceeds maxNorm=[%f] with normType=[%f]', maxNorm, normType))
+ end
+ end
+ -- check the update of the module.weight
+ for j = 1,totalIndex do
+ local k = all_index[j]
+ if j <= nIndex then -- k is an index in "input"
+ local norm = torch.norm(module.weight:select(1, k), normType)
+ local oriNorm = torch.norm(oriW:select(1, k), normType)
+ if oriNorm > maxNorm then
+ local err = norm - maxNorm
+ mytester:assertlt(math.abs(err), precision, 'unexpected norm after renorm')
+ else
+ local err = norm - oriNorm
+ mytester:assertlt(math.abs(err), precision, 'unexpected norm after renorm')
+ end
+ else -- k is not an index in "input"
+ local err = module.weight:select(1,k):sum() - oriW:select(1,k):sum()
+ mytester:assertlt(math.abs(err), precision, 'unexpected changes in weight after renorm')
+ end
+ end
+ end
+end
+
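+-- The maxNorm checks above follow the usual renorm rule: for every weight
+-- row w that the input actually indexes, if ||w||_normType > maxNorm the
+-- row is rescaled as w <- w * maxNorm / ||w||_normType, while rows never
+-- referenced by the input keep their original values, which is what the
+-- second loop over all_index verifies.
+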
+function nntest.AddConstant()
+ local nbatch = torch.random(3, 5)
+ local f = torch.random(3, 5)
+ local h = torch.random(7,9)
+ local w = torch.random(7,9)
+ local input = torch.rand(nbatch, f, h, w):mul(20):add(-10) -- [-10, 10]
+
+ local constant = torch.randn(1):squeeze()
+ local mod = nn.AddConstant(constant)
+
+ -- Test FPROP
+ local output = mod:forward(input)
+ local delta = output - input
+ mytester:assertlt(delta:add(-constant):abs():max(), precision, 'fprop error')
+
+ -- Test BPROP
+ local err = jac.testJacobian(mod, input)
+ mytester:assertlt(err, precision, 'bprop error ')
+
+ -- inplace comparisons
+ local ini = math.random(3,5)
+ local inj = math.random(3,5)
+ local ink = math.random(3,5)
+ local constant = torch.uniform()*math.random(1,10)
+
+ local input1 = torch.rand(ink, inj, ini)
+ local input2 = input1:clone()
+
+ local module1 = nn.AddConstant(constant,true)
+ local module2 = nn.AddConstant(constant)
+
+ local gradOutput1 = torch.rand(ink, inj, ini)
+ local gradOutput2 = gradOutput1:clone()
+
+ local out1 = module1:forward(input1)
+ local out2 = module2:forward(input2)
+
+ mytester:asserteq(0, (out1-out2):abs():max(), torch.typename(module1) ..
+ ' - in-place forward err ')
+
+ local gradInput1 = module1:backward(input1, gradOutput1)
+ local gradInput2 = module2:backward(input2, gradOutput2)
+
+ mytester:asserteq(0, (gradInput1-gradInput2):abs():max(),
+ torch.typename(module1) .. ' - in-place backward err ')
+
+ local input1 = torch.rand(ink, inj, ini)
+ local input2 = input1:clone()
+
+ module1:forward(input1)
+ module1:backward(module1.output,torch.rand(input1:size()))
+
+ local err = (input1-input2):abs():max()
+ mytester:asserteq(err, 0, torch.typename(module1) ..
+ ' - inplace input change err ')
+
+ local module3 = nn.AddConstant(torch.Tensor{1,2,3})
+ local out3 = module3:forward(torch.Tensor{-1,-2,-3})
+ mytester:asserteq(0, out3:abs():max(), torch.typename(module3) ..
+ ' - tensor constant forward err ')
+ local module4 = nn.AddConstant(torch.Tensor{1,2,3})
+ local out4 = module4:forward(torch.Tensor{{-1,-2,-3},{-1,-2,-3}})
+ mytester:asserteq(0, out4:abs():max(), torch.typename(module4) ..
+ ' - batch tensor constant forward err ')
+end
+
+function nntest.MulConstant()
+ local nbatch = torch.random(3, 5)
+ local f = torch.random(3, 5)
+ local h = torch.random(7,9)
+ local w = torch.random(7,9)
+ local input = torch.rand(nbatch, f, h, w):mul(20):add(-10) -- [-10, 10]
+
+ local constant = torch.randn(1):squeeze()
+ local mod = nn.MulConstant(constant)
+
+ -- Test FPROP
+ local output = mod:forward(input)
+ local scale = output:clone():cdiv(input)
+ mytester:assertlt(scale:add(-constant):abs():max(), precision, 'fprop error')
+
+ -- Test BPROP
+ local err = jac.testJacobian(mod, input)
+ mytester:assertlt(err, precision, 'bprop error ')
+
+ -- inplace comparisons
+ local ini = math.random(3,5)
+ local inj = math.random(3,5)
+ local ink = math.random(3,5)
+ local constant = torch.uniform()*math.random(1,10)
+
+ local input1 = torch.rand(ink, inj, ini)
+ local input2 = input1:clone()
+
+ local module1 = nn.MulConstant(constant,true)
+ local module2 = nn.MulConstant(constant)
+
+ local gradOutput1 = torch.rand(ink, inj, ini)
+ local gradOutput2 = gradOutput1:clone()
+
+ local out1 = module1:forward(input1)
+ local out2 = module2:forward(input2)
+
+ mytester:asserteq(0, (out1-out2):abs():max(), torch.typename(module1) ..
+ ' - in-place forward err ')
+
+ local gradInput1 = module1:backward(input1, gradOutput1)
+ local gradInput2 = module2:backward(input2, gradOutput2)
+
+ mytester:asserteq(0, (gradInput1-gradInput2):abs():max(),
+ torch.typename(module1) .. ' - in-place backward err ')
+
+ local input1 = torch.rand(ink, inj, ini)
+ local input2 = input1:clone()
+
+ module1:forward(input1)
+ module1:backward(module1.output,torch.rand(input1:size()))
+
+ local err = (input1-input2):abs():max()
+ mytester:assertalmosteq(err, 0, 1e-15, torch.typename(module1) ..
+ ' - inplace input change err ')
+end
+
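+-- Why the final comparisons in the two tests above differ: the in-place
+-- variants restore the input during backward (AddConstant subtracts the
+-- constant back out, MulConstant divides it back out), so input1 should
+-- again match the untouched clone input2. The multiply/divide round trip
+-- can leave tiny floating-point rounding error, hence MulConstant uses
+-- assertalmosteq with a 1e-15 tolerance instead of exact equality.
+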
+function nntest.Copy()
+ local input = torch.randn(3,4):double()
+ local c = nn.Copy('torch.DoubleTensor', 'torch.FloatTensor')
+ local output = c:forward(input)
+ mytester:assert(torch.type(output) == 'torch.FloatTensor', 'copy forward type err')
+ mytester:assertTensorEq(output, input:float(), 0.000001, 'copy forward value err')
+ local gradInput = c:backward(input, output)
+ mytester:assert(torch.type(gradInput) == 'torch.DoubleTensor', 'copy backward type err')
+ mytester:assertTensorEq(gradInput, input, 0.000001, 'copy backward value err')
+ c.dontCast = true
+ c:double()
+ mytester:assert(torch.type(output) == 'torch.FloatTensor', 'copy forward type err')
+end
+
+function nntest.CMaxTable()
+ local input1 = torch.Tensor{{1,3},{2,4}}
+ local input2 = torch.Tensor{{4,2},{3,1}}
+ local input = {input1, input2}
+ local module = nn.CMaxTable()
+ local err1 = torch.add(module:forward(input), -1, torch.Tensor{{4,3},{3,4}})
+ mytester:assertalmosteq(err1:abs():max(), 0, 1e-15, "CMaxTable forward call")
+ local gradOutputs = torch.Tensor{5,6,7,8}
+ local gradInputs = module:backward(input, gradOutputs)
+ local err2 = torch.add(gradInputs[1], -1, torch.Tensor{{0,6},{0,8}})
+ local err3 = torch.add(gradInputs[2], -1, torch.Tensor{{5,0},{7,0}})
+ mytester:assertalmosteq(err2:abs():max(), 0, 1e-15, "CMaxTable backward call")
+ mytester:assertalmosteq(err3:abs():max(), 0, 1e-15, "CMaxTable backward call")
+end
+
+function nntest.CMinTable()
+ local input1 = torch.Tensor{{1,3},{2,4}}
+ local input2 = torch.Tensor{{4,2},{3,1}}
+ local input = {input1, input2}
+ local module = nn.CMinTable()
+ local err1 = torch.add(module:forward(input), -1, torch.Tensor{{1,2},{2,1}})
+ mytester:assertalmosteq(err1:abs():max(), 0, 1e-15, "CMinTable forward call")
+ local gradOutputs = torch.Tensor{5,6,7,8}
+ local gradInputs = module:backward(input, gradOutputs)
+ local err2 = torch.add(gradInputs[1], -1, torch.Tensor{{5,0},{7,0}})
+ local err3 = torch.add(gradInputs[2], -1, torch.Tensor{{0,6},{0,8}})
+ mytester:assertalmosteq(err2:abs():max(), 0, 1e-15, "CMinTable backward call")
+ mytester:assertalmosteq(err3:abs():max(), 0, 1e-15, "CMinTable backward call")
+end
+
+function nntest.JoinTable()
+ local tensor = torch.rand(3,4,5)
+ local input = {tensor, tensor}
+ local module
+ for d = 1,tensor:dim() do
+ module = nn.JoinTable(d)
+ mytester:asserteq(module:forward(input):size(d), tensor:size(d)*2, "dimension " .. d)
+ end
+
+ -- Minibatch
+ local tensor = torch.rand(3,4,5)
+ local input = {tensor, tensor}
+ local module
+ for d = 1,tensor:dim()-1 do
+ module = nn.JoinTable(d, 2)
+ mytester:asserteq(module:forward(input):size(d+1), tensor:size(d+1)*2, "dimension " .. d)
+ end
+end
+
+function nntest.SplitTable()
+ local input = torch.randn(3,4,5)
+ local module
+ for d = 1,input:dim() do
+ module = nn.SplitTable(d)
+ mytester:asserteq(#module:forward(input), input:size(d), "dimension " .. d)
+ end
+
+ -- Minibatch
+ local input = torch.randn(3,4,5)
+ local module
+ for d = 1,input:dim()-1 do
+ module = nn.SplitTable(d, 2)
+ mytester:asserteq(#module:forward(input), input:size(d+1), "dimension " .. d)
+ end
+
+ -- Negative indices
+ local module = nn.SplitTable(-3)
+ local input = torch.randn(3,4,5)
+ mytester:asserteq(#module:forward(input), 3, "negative index")
+ local input = torch.randn(2,3,4,5)
+ mytester:asserteq(#module:forward(input), 3, "negative index (minibatch)")
+end
+
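+-- Negative dimensions resolve relative to the input's last dimension:
+-- nn.SplitTable(-3) splits dimension input:dim() - 3 + 1, i.e. dimension 1
+-- of a 3D tensor but dimension 2 of a 4D minibatch, which is why both
+-- asserts above expect 3 output tensors.
+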
+function nntest.Select()
+ -- Test negative Select
+ local input = torch.Tensor{{4,6,7}, {8,0,1}}
+ mytester:asserteq(nn.Select(1,-1):forward(input)[1], 8, "negative index")
+ mytester:asserteq(nn.Select(1,-1):forward(input)[2], 0, "negative index")
+ mytester:asserteq(nn.Select(1,-2):forward(input)[2], 6, "negative index")
+ mytester:asserteq(nn.Select(-1,-1):forward(input)[1], 7, "negative dim + negative index")
+ mytester:asserteq(nn.Select(-1,-1):forward(input)[2], 1, "negative dim + negative index")
+end
+
+function nntest.SelectTable()
+ local input = {
+ torch.rand(3,4,5), torch.rand(3,4,5),
+ {torch.rand(3,4,5)},
+ {torch.rand(3,4,5), {torch.rand(3,4,5)}}
+ }
+ local gradOutputs = {
+ torch.rand(3,4,5), torch.rand(3,4,5),
+ {torch.rand(3,4,5)},
+ {torch.rand(3,4,5), {torch.rand(3,4,5)}}
+ }
+ local zeros = {
+ torch.Tensor(3,4,5):zero(), torch.Tensor(3,4,5):zero(),
+ {torch.Tensor(3,4,5):zero()},
+ {torch.Tensor(3,4,5):zero(), {torch.Tensor(3,4,5):zero()}}
+ }
+ local nonIdx = {2,3,4,1}
+ local module
+ for idx = 1,#input do
+ module = nn.SelectTable(idx)
+ local output = module:forward(input)
+ equal(output, input[idx], "output dimension " .. idx)
+ local gradInput = module:backward(input, gradOutputs[idx])
+ equal(gradInput[idx], gradOutputs[idx], "gradInput[idx] dimension " .. idx)
+ equal(gradInput[nonIdx[idx]], zeros[nonIdx[idx]], "gradInput[nonIdx] dimension " .. idx)
+ end
+
+ -- test negative index
+ local idx = -2
+ module = nn.SelectTable(idx)
+ local output = module:forward(input)
+ equal(output, input[#input+idx+1], "output dimension " .. idx)
+ local gradInput = module:backward(input, gradOutputs[#input+idx+1])
+ equal(gradInput[#input+idx+1], gradOutputs[#input+idx+1], "gradInput[idx] dimension " .. idx)
+ equal(gradInput[nonIdx[#input+idx+1]], zeros[nonIdx[#input+idx+1]], "gradInput[nonIdx] dimension " .. idx)
+
+ -- test typecast
+ local idx = #input
+ module = nn.SelectTable(idx)
+ module:float()
+ local output = module:forward(input)
+ equal(output, input[idx], "type output")
+ local gradInput = module:backward(input, gradOutputs[idx])
+ equal(gradInput[idx], gradOutputs[idx], "gradInput[idx] dimension " .. idx)
+ equal(gradInput[nonIdx[idx]], zeros[nonIdx[idx]], "gradInput[nonIdx] dimension " .. idx)
+
+ -- test on differently sized sub-input tables given consecutively
+ local input1 = {
+ torch.rand(3,4,5),
+ {torch.rand(3,4,5), torch.rand(3,4,5), torch.rand(3,4,5)}
+ }
+ local input2 = {
+ torch.rand(3,4,5),
+ {torch.rand(3,4,5), torch.rand(3,4,5)}
+ }
+
+ module = nn.SelectTable(1)
+ local output = module:forward(input1)
+ equal(output, input1[1], "output dimension 1")
+ local gradInput = module:backward(input1, output)
+ mytester:assert(#gradInput == #input1, "Table lengths")
+ mytester:assert(#gradInput[2] == #input1[2], "Sub-Table lengths")
+ output = module:forward(input2)
+ equal(output, input2[1], "output dimension 1")
+ gradInput = module:backward(input2, output)
+ mytester:assert(#gradInput == #input2, "Table lengths")
+ mytester:assert(#gradInput[2] == #input2[2], "Sub-Table lengths")
+
+ -- test on tables of increasing size
+ local input1 = {torch.rand(3,4,5), torch.rand(3,4,5)}
+ local input2 = {torch.rand(3,4,5), torch.rand(3,4,5), torch.rand(3,4,5)}
+ local gradOutput1 = torch.randn(3,4,5)
+ local gradOutput2 = torch.randn(3,4,5)
+
+ local module1 = nn.SelectTable(-1)
+ local output1 = module1:forward(input1):clone()
+ local output2 = module1:forward(input2)
+ local gradInput_ = module1:backward(input1, gradOutput1)
+ local gradInput1 = {}
+ for k,v in ipairs(gradInput_) do gradInput1[k] = v:clone() end
+ local gradInput2 = module1:backward(input2, gradOutput2)
+
+ local module3 = nn.SelectTable(-1)
+ local module4 = nn.SelectTable(-1)
+ local output3 = module3:forward(input1)
+ local output4 = module4:forward(input2)
+ local gradInput3 = module3:backward(input1, gradOutput1)
+ local gradInput4 = module4:backward(input2, gradOutput2)
+
+ equal(output1, output3, "output 1 and 3")
+ equal(output2, output4, "output 2 and 4")
+ equal(gradInput1, gradInput3, "gradInput 1 and 3")
+ equal(gradInput2, gradInput4, "gradInput 2 and 4")
+end
+
+function nntest.MixtureTable()
+ -- 2D
+ -- expertInput is a Table:
+ local expertInput = torch.randn(5,3,6)
+ local gradOutput = torch.randn(5,6)
+ local input = {
+ torch.rand(5,3),
+ {expertInput:select(2,1), expertInput:select(2,2), expertInput:select(2,3)}
+ }
+ local module = nn.MixtureTable()
+ local output = module:forward(input)
+ local output2 = torch.cmul(input[1]:view(5,3,1):expand(5,3,6), expertInput):sum(2):squeeze(2)
+ mytester:assertTensorEq(output, output2, 0.000001, "mixture output")
+ local gradInput = module:backward(input, gradOutput)
+ local gradOutput2 = torch.view(gradOutput, 5, 1, 6):expandAs(expertInput)
+ local gaterGradInput2 = torch.cmul(gradOutput2, expertInput):sum(3):select(3,1)
+ mytester:assertTensorEq(gradInput[1], gaterGradInput2, 0.000001, "mixture gater gradInput")
+ local expertGradInput2 = torch.cmul(input[1]:view(5,3,1):expand(5,3,6), gradOutput:view(5,1,6):expand(5,3,6))
+ for i, expertGradInput in ipairs(gradInput[2]) do
+ mytester:assertTensorEq(expertGradInput, expertGradInput2:select(2,i), 0.000001, "mixture expert "..i.." gradInput")
+ end
+ -- expertInput is a Tensor:
+ local input = {input[1], expertInput}
+ local module = nn.MixtureTable(2)
+ local output = module:forward(input)
+ mytester:assertTensorEq(output, output2, 0.000001, "mixture2 output")
+ local gradInput = module:backward(input, gradOutput)
+ mytester:assertTensorEq(gradInput[1], gaterGradInput2, 0.000001, "mixture2 gater gradInput")
+ mytester:assertTensorEq(gradInput[2], expertGradInput2, 0.000001, "mixture2 expert gradInput")
+
+ -- 3D
+ local expertInput = torch.randn(5,6,3,2)
+ local gradOutput = torch.randn(5,6,2)
+ -- expertInput is a Table:
+ local input = {
+ torch.rand(5,3),
+ {expertInput:select(3,1), expertInput:select(3,2), expertInput:select(3,3)}
+ }
+ local module = nn.MixtureTable()
+ local output = module:forward(input)
+ local output2 = torch.cmul(input[1]:view(5,1,3,1):expand(5,6,3,2), expertInput):sum(3):squeeze(3)
+ mytester:assertTensorEq(output, output2, 0.000001, "mixture3 output")
+ local gradInput = module:backward(input, gradOutput)
+ local gradOutput2 = torch.view(gradOutput,5,6,1,2):expandAs(expertInput)
+ local gaterGradInput2 = torch.cmul(gradOutput2, expertInput):sum(4):select(4,1):sum(2):select(2,1)
+ mytester:assertTensorEq(gradInput[1], gaterGradInput2, 0.000001, "mixture3 gater gradInput")
+ local expertGradInput2 = torch.cmul(input[1]:view(5,1,3,1):expand(5,6,3,2), gradOutput2)
+ for i, expertGradInput in ipairs(gradInput[2]) do
+ mytester:assertTensorEq(expertGradInput, expertGradInput2:select(3,i), 0.000001, "mixture3 expert "..i.." gradInput")
+ end
+ -- expertInput is a Tensor
+ local input = {input[1], expertInput}
+ local module = nn.MixtureTable(3)
+ local output = module:forward(input)
+ mytester:assertTensorEq(output, output2, 0.000001, "mixture4 output")
+ local gradInput = module:backward(input, gradOutput)
+ mytester:assertTensorEq(gradInput[1], gaterGradInput2, 0.000001, "mixture4 gater gradInput")
+ mytester:assertTensorEq(gradInput[2], expertGradInput2, 0.000001, "mixture4 expert gradInput")
+
+ -- 1D
+ -- expertInput is a Table:
+ local expertInput = torch.randn(3,6)
+ local gradOutput = torch.randn(6)
+ local input = {
+ torch.rand(3),
+ {expertInput:select(1,1), expertInput:select(1,2), expertInput:select(1,3)}
+ }
+ local module = nn.MixtureTable()
+ local output = module:forward(input)
+ local output2 = torch.cmul(input[1]:view(3,1):expand(3,6), expertInput):sum(1):squeeze(1)
+ mytester:assertTensorEq(output, output2, 0.000001, "mixture5 output")
+ local gradInput = module:backward(input, gradOutput)
+ local gradOutput2 = torch.view(gradOutput, 1, 6):expandAs(expertInput)
+ local gaterGradInput2 = torch.cmul(gradOutput2, expertInput):sum(2):select(2,1)
+ mytester:assertTensorEq(gradInput[1], gaterGradInput2, 0.000001, "mixture5 gater gradInput")
+ local expertGradInput2 = torch.cmul(input[1]:view(3,1):expand(3,6), gradOutput:view(1,6):expand(3,6))
+ for i, expertGradInput in ipairs(gradInput[2]) do
+ mytester:assertTensorEq(expertGradInput, expertGradInput2:select(1,i), 0.000001, "mixture5 expert "..i.." gradInput")
+ end
+ -- test type-cast
+ module:float()
+ local input2 = {
+ input[1]:float(),
+ {input[2][1]:float(), input[2][2]:float(), input[2][3]:float()}
+ }
+ local output = module:forward(input2)
+ mytester:assertTensorEq(output, output2:float(), 0.000001, "mixture5B output")
+ local gradInput = module:backward(input2, gradOutput:float())
+ mytester:assertTensorEq(gradInput[1], gaterGradInput2:float(), 0.000001, "mixture5B gater gradInput")
+ for i, expertGradInput in ipairs(gradInput[2]) do
+ mytester:assertTensorEq(expertGradInput, expertGradInput2:select(1,i):float(), 0.000001, "mixture5B expert "..i.." gradInput")
+ end
+ -- expertInput is a Tensor:
+ local input = {input[1], expertInput}
+ local module = nn.MixtureTable(1)
+ local output = module:forward(input)
+ mytester:assertTensorEq(output, output2, 0.000001, "mixture6 output")
+ local gradInput = module:backward(input, gradOutput)
+ mytester:assertTensorEq(gradInput[1], gaterGradInput2, 0.000001, "mixture6 gater gradInput")
+ mytester:assertTensorEq(gradInput[2], expertGradInput2, 0.000001, "mixture6 expert gradInput")
+ -- test type-cast:
+ module:float()
+ local input2 = {input[1]:float(), expertInput:float()}
+ local output = module:forward(input2)
+ mytester:assertTensorEq(output, output2:float(), 0.000001, "mixture6B output")
+ local gradInput = module:backward(input2, gradOutput:float())
+ mytester:assertTensorEq(gradInput[1], gaterGradInput2:float(), 0.000001, "mixture6B gater gradInput")
+ mytester:assertTensorEq(gradInput[2], expertGradInput2:float(), 0.000001, "mixture6B expert gradInput")
+
+ -- 2D gater, 1D expert
+ -- expertInput is a Table:
+ local expertInput = torch.randn(5,3)
+ local gradOutput = torch.randn(5)
+ local input = {
+ torch.rand(5,3),
+ {expertInput:select(2,1), expertInput:select(2,2), expertInput:select(2,3)}
+ }
+ local module = nn.MixtureTable()
+ local output = module:forward(input)
+ local output2 = torch.cmul(input[1], expertInput):sum(2):squeeze(2)
+ mytester:assertTensorEq(output, output2, 0.000001, "mixture7 output")
+ local gradInput = module:backward(input, gradOutput)
+ local gradOutput2 = torch.view(gradOutput, 5, 1):expandAs(expertInput)
+ local gaterGradInput2 = torch.cmul(gradOutput2, expertInput)
+ mytester:assertTensorEq(gradInput[1], gaterGradInput2, 0.000001, "mixture7 gater gradInput")
+ local expertGradInput2 = torch.cmul(input[1], gradOutput:view(5,1):expand(5,3))
+ for i, expertGradInput in ipairs(gradInput[2]) do
+ mytester:assertTensorEq(expertGradInput, expertGradInput2:select(2,i), 0.000001, "mixture7 expert "..i.." gradInput")
+ end
+end
+
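+-- The expected values in the MixtureTable tests above are hand-computed
+-- from the mixture-of-experts formula: with gater weights g and expert
+-- outputs e_i, the output is y = sum_i g_i * e_i, hence
+-- dL/dg_i = <gradOutput, e_i> and dL/de_i = g_i * gradOutput, which is
+-- exactly what the cmul/expand/sum chains reconstruct.
+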
+function nntest.Narrow()
+ -- check basic narrow functionality #1
+ local input = torch.rand(9, 4, 14)
+ local output = input:narrow(1, 3, 5)
+ local gradOutput = torch.rand(5, 4, 14)
+ local gradInput = torch.zeros(9, 4, 14)
+ gradInput:narrow(1, 3, 5):copy(gradOutput)
+ local module1 = nn.Narrow(1, 3, 5)
+ local output1 = module1:forward(input)
+ local gradInput1 = module1:backward(input, gradOutput)
+ local module2 = nn.Narrow(1, 3, -3)
+ local output2 = module2:forward(input)
+ local gradInput2 = module2:backward(input, gradOutput)
+ mytester:assertTensorEq(output, output1, 0.0000001, "Narrow #1 output err")
+ mytester:assertTensorEq(gradInput, gradInput1, 0.00001, "Narrow #1 gradInput err")
+ mytester:assertTensorEq(output, output2, 0.0000001, "Narrow #1 negative output err")
+ mytester:assertTensorEq(gradInput, gradInput2, 0.00001, "Narrow #1 negative gradInput err")
+
+ -- check basic narrow functionality #2
+ local input = torch.rand(3, 10, 4)
+ local output = input:narrow(2, 5, 3)
+ local gradOutput = torch.rand(3, 3, 4)
+ local gradInput = torch.zeros(3, 10, 4)
+ gradInput:narrow(2, 5, 3):copy(gradOutput)
+ local module1 = nn.Narrow(2, 5, 3)
+ local output1 = module1:forward(input)
+ local gradInput1 = module1:backward(input, gradOutput)
+ local module2 = nn.Narrow(2, 5, -4)
+ local output2 = module2:forward(input)
+ local gradInput2 = module2:backward(input, gradOutput)
+ mytester:assertTensorEq(output, output1, 0.0000001, "Narrow #2 output err")
+ mytester:assertTensorEq(gradInput, gradInput1, 0.00001, "Narrow #2 gradInput err")
+ mytester:assertTensorEq(output, output2, 0.0000001, "Narrow #2 negative output err")
+ mytester:assertTensorEq(gradInput, gradInput2, 0.00001, "Narrow #2 negative gradInput err")
+
+ -- check basic narrow functionality #3
+ local input = torch.rand(6, 11, 7)
+ local output = input:narrow(3, 1, 1)
+ local gradOutput = torch.rand(6, 11, 1)
+ local gradInput = torch.zeros(6, 11, 7)
+ gradInput:narrow(3, 1, 1):copy(gradOutput)
+ local module1 = nn.Narrow(3, 1, 1)
+ local output1 = module1:forward(input)
+ local gradInput1 = module1:backward(input, gradOutput)
+ local module2 = nn.Narrow(3, 1, -7)
+ local output2 = module2:forward(input)
+ local gradInput2 = module2:backward(input, gradOutput)
+ mytester:assertTensorEq(output, output1, 0.0000001, "Narrow #3 output err")
+ mytester:assertTensorEq(gradInput, gradInput1, 0.00001, "Narrow #3 gradInput err")
+ mytester:assertTensorEq(output, output2, 0.0000001, "Narrow #3 negative output err")
+ mytester:assertTensorEq(gradInput, gradInput2, 0.00001, "Narrow #3 negative gradInput err")
+
+ -- check basic narrow functionality #4
+ local input = torch.rand(3, 10, 4)
+ local output = input:narrow(2, 5, 3)
+ local gradOutput = torch.rand(3, 3, 4)
+ local gradInput = torch.zeros(3, 10, 4)
+ gradInput:narrow(2, 5, 3):copy(gradOutput)
+ local module1 = nn.Narrow(-2, 5, 3)
+ local output1 = module1:forward(input)
+ local gradInput1 = module1:backward(input, gradOutput)
+ local module2 = nn.Narrow(-2, 5, -4)
+ local output2 = module2:forward(input)
+ local gradInput2 = module2:backward(input, gradOutput)
+ mytester:assertTensorEq(output, output1, 0.0000001, "Narrow #4 output err")
+ mytester:assertTensorEq(gradInput, gradInput1, 0.00001, "Narrow #4 gradInput err")
+ mytester:assertTensorEq(output, output2, 0.0000001, "Narrow #4 negative output err")
+ mytester:assertTensorEq(gradInput, gradInput2, 0.00001, "Narrow #4 negative gradInput err")
+
+ -- check narrow negative offset
+ local input = torch.rand(3, 10, 4)
+ local output = input:narrow(2, 1, 3)
+ local gradOutput = torch.rand(3, 3, 4)
+ local gradInput = torch.zeros(3, 10, 4)
+ gradInput:narrow(2, 1, 3):copy(gradOutput)
+ local module1 = nn.Narrow(2, -1, 7)
+ local output1 = module1:forward(input)
+ local gradInput1 = module1:backward(input, gradOutput)
+ local module2 = nn.Narrow(2, 1, 3)
+ local output2 = module2:forward(input)
+ local gradInput2 = module2:backward(input, gradOutput)
+ mytester:assertTensorEq(output, output1, 0.0000001, "Narrow #5 output err")
+ mytester:assertTensorEq(gradInput, gradInput1, 0.00001, "Narrow #5 gradInput err")
+ mytester:assertTensorEq(output, output2, 0.0000001, "Narrow #5 negative output err")
+ mytester:assertTensorEq(gradInput, gradInput2, 0.00001, "Narrow #5 negative gradInput err")
+end
+
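+-- The negative-length cases above rely on nn.Narrow's convention that a
+-- negative length counts from the end of the dimension: the slice ends at
+-- element size + length + 1, so the effective length is
+-- size - offset + length + 2; for case #1 that is 9 - 3 + (-3) + 2 = 5,
+-- matching nn.Narrow(1, 3, 5).
+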
+function nntest.NarrowTable()
+ local input = torch.randn(3,10,4)
+ local gradOutput = torch.randn(3,3,4)
+ local nt = nn.NarrowTable(5,3)
+ local seq = nn.Sequential()
+ seq:add(nn.SplitTable(1,2))
+ seq:add(nt)
+ seq:add(nn.JoinTable(1,1))
+ seq:add(nn.Reshape(3,3,4))
+ local seq2 = nn.Narrow(2,5,3)
+ local output = seq:forward(input)
+ local gradInput = seq:backward(input, gradOutput)
+ local output2 = seq2:forward(input)
+ local gradInput2 = seq2:backward(input, gradOutput)
+ mytester:assertTensorEq(output, output2, 0.0000001, "NarrowTable output err")
+ mytester:assertTensorEq(gradInput, gradInput2, 0.00001, "NarrowTable gradInput err")
+
+ -- now try it with a smaller input
+ local input = input:narrow(2, 1, 8)
+ local output = seq:forward(input)
+ local gradInput = seq:backward(input, gradOutput)
+ local output2 = seq2:forward(input)
+ local gradInput2 = seq2:backward(input, gradOutput)
+ mytester:assertTensorEq(output, output2, 0.0000001, "NarrowTable small output err")
+ mytester:assertTensorEq(gradInput, gradInput2, 0.00001, "NarrowTable small gradInput err")
+
+ -- test type-cast
+ local input = input:float()
+ local gradOutput = gradOutput:float()
+ seq:float()
+ seq2:float()
+ local output = seq:forward(input)
+ local gradInput = seq:backward(input, gradOutput)
+ local output2 = seq2:forward(input)
+ local gradInput2 = seq2:backward(input, gradOutput)
+ mytester:assertTensorEq(output, output2, 0.0000001, "NarrowTable output float err")
+ mytester:assertTensorEq(gradInput, gradInput2, 0.00001, "NarrowTable gradInput float err")
+end
+
+function nntest.View()
+ local input = torch.rand(10)
+ local template = torch.rand(5,2)
+ local target = template:size():totable()
+ local module = nn.View(template:size())
+ mytester:assertTableEq(module:forward(input):size():totable(), target, "Error in forward (1)")
+ local module = nn.View(table.unpack(target))
+ mytester:assertTableEq(module:forward(input):size():totable(), target, "Error in forward (2)")
+
+ -- Minibatch
+ local minibatch = torch.rand(5,10)
+ mytester:asserteq(module:forward(minibatch):size(1),
+ minibatch:size(1),
+ "Error in minibatch dimension")
+ mytester:asserteq(module:forward(minibatch):nElement(),
+ minibatch:nElement(),
+ "Error in minibatch nElement")
+ local module = nn.View(-1):setNumInputDims(1)
+ mytester:asserteq(module:forward(minibatch):size(1),
+ minibatch:size(1),
+ "Error in minibatch dimension with size -1")
+ mytester:asserteq(module:forward(minibatch):nElement(),
+ minibatch:nElement(),
+ "Error in minibatch nElement with size -1")
+
+ -- another setNumInputDims case
+ local minibatch = torch.rand(5,4,10)
+ local module = nn.View(-1):setNumInputDims(2)
+ mytester:asserteq(module:forward(minibatch):size(1),
+ minibatch:size(1),
+ "Error in minibatch dimension with size -1")
+
+ -- another setNumInputDims case
+ local minibatch = torch.rand(2,5,4,10)
+ local module = nn.View(4,-1):setNumInputDims(2)
+ local out = module:forward(minibatch)
+ mytester:asserteq(out:size(1), minibatch:size(1)*minibatch:size(2),
+ "Error in minibatch dimension with size -1")
+ mytester:asserteq(out:size(2), minibatch:size(3),
+ "Error in minibatch dimension with size -1")
+ mytester:asserteq(out:size(3), minibatch:size(4),
+ "Error in minibatch dimension with size -1")
+
+ -- Minibatch Generalization
+ local minibatch = torch.rand(5,2,6)
+ local module = nn.View(6)
+ mytester:asserteq(
+ module:forward(minibatch):size(1),
+ minibatch:size(1)*minibatch:size(2),
+ "Error in minibatch generalization dimension")
+ mytester:asserteq(
+ module:forward(minibatch):nElement(),
+ minibatch:nElement(),
+ "Error in minibatch generalization nElement")
+end
+
+function nntest.Reshape()
+ local input = torch.rand(10)
+ local template = torch.rand(5,2)
+ local target = template:size():totable()
+ local module = nn.Reshape(template:size())
+ mytester:assertTableEq(module:forward(input):size():totable(), target, "Error in forward (1)")
+ local module = nn.Reshape(table.unpack(target))
+ mytester:assertTableEq(module:forward(input):size():totable(), target, "Error in forward (2)")
+
+ -- Minibatch
+ local minibatch = torch.rand(5,10)
+ mytester:asserteq(module:forward(minibatch):size(1),
+ minibatch:size(1),
+ "Error in minibatch dimension")
+ mytester:asserteq(module:forward(minibatch):nElement(),
+ minibatch:nElement(),
+ "Error in minibatch nElement")
+end
+
+-- Define a test for SpatialUpSamplingNearest
+function nntest.SpatialUpSamplingNearest()
+ local scale = torch.random(2,4)
+ for dim = 3,4 do
+ local m = nn.SpatialUpSamplingNearest(scale)
+
+ -- Create a randomly sized dim-D tensor
+ local shape = {}
+ for i = 1, dim do
+ table.insert(shape, torch.random(2, 2+dim-1))
+ end
+
+ -- Check that the gradient is correct by using finite differences
+ local input = torch.Tensor(table.unpack(shape)):zero()
+
+ local err = jac.testJacobian(m, input)
+ mytester:assertlt(err, precision, ' error on state ')
+
+ local ferr, berr = jac.testIO(m, input)
+ mytester:asserteq(ferr, 0, torch.typename(m)..' - i/o forward err ')
+ mytester:asserteq(berr, 0, torch.typename(m)..' - i/o backward err ')
+ end
+end
+
+function nntest.SpatialUpSamplingBilinear()
+ for scale=2,4 do
+ for dim = 3,4 do
+ local m = nn.SpatialUpSamplingBilinear(scale)
+
+ -- Create a randomly sized dim-D tensor
+ local shape = {}
+ for i = 1, dim do
+ table.insert(shape, torch.random(2, 2+dim-1))
+ end
+
+ -- Check that the gradient is correct by using finite differences
+ local input = torch.DoubleTensor(table.unpack(shape)):normal()
+
+ local err = jac.testJacobian(m, input)
+ mytester:assertlt(err, precision, ' error on state ')
+
+ local ferr, berr = jac.testIO(m, input)
+ mytester:asserteq(ferr, 0, torch.typename(m)..' - i/o forward err ')
+ mytester:asserteq(berr, 0, torch.typename(m)..' - i/o backward err ')
+ end
+ end
+end
+
+function nntest.Concat()
+ local input = torch.randn(4, 2)
+ local num_modules = math.random(2, 5)
+ local linears = {}
+ for i = 1,num_modules do
+ linears[i] = nn.Linear(2,5)
+ end
+
+ local m = nn.Concat(1)
+ for _,module in ipairs(linears) do
+ m:add(module)
+ module:zeroGradParameters()
+ module.weight:fill(1)
+ module.bias:fill(0)
+ end
+ mytester:asserteq(m:size(), num_modules)
+
+ local output = m:forward(input)
+ local output2 = input:sum(2):expand(4, 5):repeatTensor(num_modules, 1)
+ mytester:assertTensorEq(output2, output, 0.000001, 'Concat forward err')
+
+ local gradInput = m:backward(input, torch.ones(output2:size()))
+ local gradInput2 = torch.ones(4, 2):fill(num_modules * 5)
+ mytester:assertTensorEq(gradInput, gradInput2, 0.000001, 'Concat backward err (gradInput)')
+
+ local gradWeight = input:sum(1):expand(5, 2)
+ for _,module in ipairs(linears) do
+ mytester:assertTensorEq(gradWeight, module.gradWeight, 0.000001, 'Concat backward err (gradWeight)')
+ end
+end
+
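+-- The expected values above follow from the hand-set parameters: with every
+-- Linear weight at 1 and bias at 0, each of the 5 output units computes
+-- sum_i x_i, so one module maps the 4x2 input to input:sum(2) expanded over
+-- 5 columns, and nn.Concat(1) stacks num_modules of these along dim 1.
+-- In backward, each of the num_modules * 5 output units passes gradient 1
+-- to every input, giving the constant gradInput2 of num_modules * 5.
+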
+function nntest.Parallel()
+ local input = torch.randn(3, 4, 5)
+ local m = nn.Parallel(1,3)
+ m:add(nn.View(4,5,1))
+ m:add(nn.View(4,5,1))
+ m:add(nn.View(4,5,1))
+
+ local output = m:forward(input)
+ local output2 = input:transpose(1,3):transpose(1,2)
+ mytester:assertTensorEq(output2, output, 0.000001, 'Parallel forward err')
+
+ local gradInput = m:backward(input, output2)
+ mytester:assertTensorEq(gradInput, input, 0.000001, 'Parallel backward err')
+end
+
+function nntest.ParallelTable()
+ local input = torch.randn(3, 4, 5)
+ local p = nn.ParallelTable()
+ p:add(nn.View(4,5,1))
+ p:add(nn.View(4,5,1))
+ p:add(nn.View(4,5,1))
+ local m = nn.Sequential()
+ m:add(nn.SplitTable(1))
+ m:add(p)
+ m:add(nn.JoinTable(3))
+
+ local output = m:forward(input)
+ local output2 = input:transpose(1,3):transpose(1,2)
+ mytester:assertTensorEq(output2, output, 0.000001, 'ParallelTable forward err')
+
+ local gradInput = m:backward(input, output2)
+ mytester:assertTensorEq(gradInput, input, 0.000001, 'ParallelTable backward err')
+end
+
+function nntest.ConcatTable()
+ -- Test tensor input
+ local input = torch.rand(5, 5, 5)
+ local m = nn.Sequential()
+
+ local concat = nn.ConcatTable()
+ concat:add(nn.Identity())
+
+ m:add(concat) -- Output of concat is a table of length 1
+ m:add(nn.JoinTable(1)) -- jac needs a tensor output
+
+ local err = jac.testJacobian(m, input)
+ mytester:assertlt(err, precision, ' error on state ')
+
+ local ferr, berr = jac.testIO(m, input)
+ mytester:asserteq(ferr, 0, torch.typename(m)..' - i/o forward err ')
+ mytester:asserteq(berr, 0, torch.typename(m)..' - i/o backward err ')
+
+ -- Now test a table input
+ local input = {
+ torch.randn(3,4):float(), torch.randn(3,4):float(), {torch.randn(3,4):float()}
+ }
+ local _gradOutput = {
+ torch.randn(3,3,4):float(), torch.randn(3,3,4):float(), torch.randn(3,3,4):float()
+ }
+ local gradOutput = {
+ {_gradOutput[1][1], _gradOutput[2][1], {_gradOutput[3][1]}},
+ {_gradOutput[1][2], _gradOutput[2][2], {_gradOutput[3][2]}},
+ {_gradOutput[1][3], _gradOutput[2][3], {_gradOutput[3][3]}}
+ }
+ local module = nn.ConcatTable()
+ module:add(nn.Identity())
+ module:add(nn.Identity())
+ module:add(nn.Identity())
+ module:float()
+
+ local output = module:forward(input)
+ local output2 = {input, input, input}
+ equal(output2, output, "ConcatTable table output")
+ local gradInput = module:backward(input, gradOutput)
+ local gradInput2 = {_gradOutput[1]:sum(1):squeeze(1), _gradOutput[2]:sum(1):squeeze(1), {_gradOutput[3]:sum(1):squeeze(1)}}
+ equal(gradInput, gradInput2, "ConcatTable table gradInput")
+
+ -- test outputs for variable length inputs
+ local test = nn.ConcatTable()
+ test:add(nn.Identity())
+ test:add(nn.Identity())
+
+ local x = {torch.randn(5), torch.randn(5)}
+ local y = {torch.randn(5)}
+
+ local o1 = #(test:forward(x))
+ local go1 = #(test:backward(x, {x, x}))
+ local o2 = #(test:forward(y))
+ local go2 = #(test:backward(y, {y, y}))
+ mytester:assert(o1 == 2, "ConcatTable table variable length")
+ mytester:assert(go1 == 2, "ConcatTable table variable length")
+ mytester:assert(o2 == 2, "ConcatTable table variable length")
+ mytester:assert(go2 == 1, "ConcatTable table variable length")
+end
+
+function nntest.MapTable()
+ local map = nn.MapTable(nn.Linear(10,5))
+ local lin = map:get(1):clone()
+
+ -- ParallelTable with shared clones as reference
+ local parallel = nn.ParallelTable()
+ parallel:add(lin)
+ parallel:add(lin:clone('weight','bias'))
+ parallel:add(lin:clone('weight','bias'))
+
+ local input = {torch.rand(10), torch.rand(10), torch.rand(10)}
+ local gradOutput = {torch.ones(5), torch.ones(5), torch.ones(5)}
+
+ local outputM = map:forward(input)
+ local outputP = parallel:forward(input)
+ mytester:assertTensorEq(outputM[1], outputP[1])
+ mytester:assertTensorEq(outputM[2], outputP[2])
+ mytester:assertTensorEq(outputM[3], outputP[3])
+ mytester:assert(map:size() == #input)
+
+ map:zeroGradParameters()
+ parallel:zeroGradParameters()
+ local gradInputM = map:backward(input, gradOutput)
+ local gradInputP = parallel:backward(input, gradOutput)
+ mytester:assertTensorEq(gradInputM[1], gradInputP[1])
+ mytester:assertTensorEq(gradInputM[2], gradInputP[2])
+ mytester:assertTensorEq(gradInputM[3], gradInputP[3])
+
+ map:updateParameters(1)
+ parallel:updateParameters(1)
+ mytester:assertTensorEq(map:get(1).weight, parallel:get(1).weight, 0.00001)
+
+ local output = map:forward({input[1], input[2], input[3], input[3]})
+ mytester:assert(#output == 4)
+ local output = map:forward({input[1], input[2]})
+ mytester:assert(#output == 2)
+
+ map:resize(10)
+ mytester:assert(map:size() == 10)
+ map:resize(4)
+ mytester:assert(map:size() == 4)
+ mytester:assert(torch.pointer(map:get(4).weight:storage())
+ == torch.pointer(map:get(1).weight:storage()))
+ map:clearState()
+ mytester:assert(map:size() == 1)
+
+ -- check if gradients are correctly reset
+ -- share weights and gradients
+ map = nn.MapTable(nn.Linear(10,5))
+ map:forward(input)
+ local _, gradParams = map:getParameters()
+ gradParams:uniform()
+ map:zeroGradParameters()
+ mytester:assertlt(gradParams:sum(),precision)
+
+ -- check if gradients are correctly reset
+ -- do not share weights and gradients
+ map = nn.MapTable(nn.Linear(10,5),false)
+ map:forward(input)
+ _, gradParams = map:getParameters()
+ gradParams:uniform()
+ map:zeroGradParameters()
+ mytester:assertlt(gradParams:sum(),precision)
+end
+
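+-- nn.MapTable applies one module to every entry of the input table, using
+-- clones that share their parameters with the original (sharing can be
+-- disabled via the second constructor argument, as the last block above
+-- shows); that is why an nn.ParallelTable of explicit
+-- clone('weight','bias') copies serves as the reference implementation.
+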
+function nntest.FlattenTable()
+ -- Create a nested table. Obviously we can't even stochastically test
+ -- the space of all possible nested tables (it's infinite), but here is a
+ -- hand-coded one that covers all the cases we need:
+ local input = {
+ torch.rand(1),
+ {
+ torch.rand(2),
+ {
+ torch.rand(3)
+ },
+ },
+ torch.rand(4)
+ }
+ local gradOutput = {
+ torch.rand(1),
+ torch.rand(2),
+ torch.rand(3),
+ torch.rand(4)
+ }
+
+ -- Check the FPROP
+ local m = nn.FlattenTable()
+ local output = m:forward(input)
+ mytester:assert(#output == 4, torch.typename(m)..' - fprop err ')
+ -- This is ugly, but check that the mapping from input to output is correct
+ mytester:assert(output[1] == input[1])
+ mytester:assert(output[2] == input[2][1])
+ mytester:assert(output[3] == input[2][2][1])
+ mytester:assert(output[4] == input[3])
+
+ -- Check the BPROP
+ local gradInput = m:backward(input, gradOutput)
+ -- Again, check that the mapping is correct
+ mytester:assert(gradOutput[1] == gradInput[1])
+ mytester:assert(gradOutput[2] == gradInput[2][1])
+ mytester:assert(gradOutput[3] == gradInput[2][2][1])
+ mytester:assert(gradOutput[4] == gradInput[3])
+
+ -- More ugliness: FlattenTable doesn't rebuild the table on every updateOutput
+ -- call, so we need to make sure that modifications to the input are
+ -- detected correctly (and that the table is correctly rebuilt).
+ -- CASE 1: Nothing changes so the output table shouldn't be redefined
+ local old_input_map = m.input_map
+ local old_output = m.output
+ local _ = m:forward(input)
+ mytester:assert(old_input_map == m.input_map and old_output == m.output)
+
+ -- CASE 2: An element is added to the input table
+ old_input_map = m.input_map
+ old_output = m.output
+ input[2][#(input[2])+1] = torch.rand(5)
+ m:forward(input)
+ mytester:assert(old_input_map ~= m.input_map and old_output ~= m.output)
+
+ -- CASE 3: An element is removed from the input table
+ old_input_map = m.input_map
+ old_output = m.output
+ input[#input] = nil
+ m:forward(input)
+ mytester:assert(old_input_map ~= m.input_map and old_output ~= m.output)
+
+ -- At this point further testing is probably not necessary, but just to be
+ -- consistent: perform a Jacobian test using SplitTable and JoinTable
+ -- modules
+ m = nn.Sequential()
+ local par = nn.ParallelTable()
+ par:add(nn.SplitTable(1))
+ par:add(nn.SplitTable(1))
+ m:add(nn.SplitTable(1))
+ m:add(par) -- this will create a nested table
+ m:add(nn.FlattenTable()) -- This will flatten the nested table
+ m:add(nn.JoinTable(1)) -- Finally, this will create a 1D tensor
+
+ input = torch.Tensor(2,2,2)
+ local err = jac.testJacobian(m, input)
+ mytester:assertlt(err, precision, 'error on bprop ')
+end
+
+function nntest.L1Penalty()
+ local weight = 1
+ local sizeAverage = false
+ local m = nn.L1Penalty(weight, sizeAverage, false)
+
+ local input = torch.rand(2,10):add(-0.5)
+ input[1][1] = 0
+
+ local _ = m:forward(input)
+ local grad = m:backward(input, torch.ones(input:size()))
+
+ local err = input:clone():abs():sum()*weight - m.loss
+ mytester:assertlt(math.abs(err), precision, 'error on fprop ')
+
+ local true_grad = (input:gt(0):typeAs(grad) +
+ input:lt(0):typeAs(grad):mul(-1)):mul(weight)
+ mytester:assertlt((true_grad - grad):abs():max(), precision,
+ 'error on bprop ')
+
+ -- Note: We cannot use the Jacobian test for this Module since the backward
+ -- gradient cannot be estimated using finite differences (ie, the loss
+ -- during BPROP is not included in the FPROP output)
+end
+
+function nntest.L1Cost()
+ local input = torch.rand(10) * 2 - 1
+ local m = nn.L1Cost()
+ local output = m:forward(input)
+ local err = output - torch.abs(input):sum()
+ mytester:assertalmosteq(err, 0, 1e-15, 'L1Cost forward')
+end
+
+function nntest.DepthConcat()
+ local outputSize = torch.IntTensor{5,6,7,8}
+ local input = torch.randn(2,3,12,12)
+ local gradOutput = torch.randn(2, outputSize:sum(), 12, 12)
+ local concat = nn.DepthConcat(2)
+ concat:add(nn.SpatialConvolutionMM(3, outputSize[1], 1, 1, 1, 1)) --> 2, 5, 12, 12
+ concat:add(nn.SpatialConvolutionMM(3, outputSize[2], 3, 3, 1, 1)) --> 2, 6, 10, 10
+ concat:add(nn.SpatialConvolutionMM(3, outputSize[3], 4, 4, 1, 1)) --> 2, 7, 9, 9
+ concat:add(nn.SpatialConvolutionMM(3, outputSize[4], 5, 5, 1, 1)) --> 2, 8, 8, 8
+ concat:zeroGradParameters()
+ -- forward/backward
+ local outputConcat = concat:forward(input)
+ local gradInputConcat = concat:backward(input, gradOutput)
+ -- the spatial dims are the largest, the nFilters is the sum
+ local output = torch.Tensor(2, outputSize:sum(), 12, 12):zero() -- zero for padding
+ local narrows = { {{},{1,5},{},{}}, {{},{6,11},{2,11},{2,11}}, {{},{12,18},{2,10},{2,10}}, {{},{19,26},{3,10},{3,10}} }
+ local gradInput = input:clone():zero()
+ for i=1,4 do
+ local conv = concat:get(i)
+ local gradWeight = conv.gradWeight:clone()
+ conv:zeroGradParameters()
+ output[narrows[i]]:copy(conv:forward(input))
+ gradInput:add(conv:backward(input, gradOutput[narrows[i]]))
+ mytester:assertTensorEq(gradWeight, conv.gradWeight, 0.000001, "Error in DepthConcat:accGradParameters for conv "..i)
+ end
+ mytester:assertTensorEq(output, outputConcat, 0.000001, "Error in DepthConcat:updateOutput")
+ mytester:assertTensorEq(gradInput, gradInputConcat, 0.000001, "Error in DepthConcat:updateGradInput")
+end
+
+function nntest.MV()
+ local mv = nn.MV(false)
+ local outdim = torch.random(10,20)
+ local indim = torch.random(10,20)
+ local M = torch.randn(outdim, indim)
+ local V = torch.randn(indim)
+
+ -- Test forward pass.
+ local output = mv:forward({M, V})
+ mytester:assertTableEq(output:size():totable(), {outdim},
+ 'Output has wrong dimensionality')
+ mytester:assertTensorEq(output, M * V, 1e-10,
+ 'Wrong output')
+
+ -- Test backward pass.
+ local gradOutput = torch.randn(outdim)
+ local gradInput = mv:backward({M, V}, gradOutput)
+ mytester:assert(#gradInput == 2, 'gradInput must be table of size 2')
+ local gradM, gradV = table.unpack(gradInput)
+ mytester:assertTableEq(gradM:size():totable(), M:size():totable(),
+ 'Gradient for input M has wrong size')
+ mytester:assertTableEq(gradV:size():totable(), V:size():totable(),
+ 'Gradient for input V has wrong size')
+ mytester:assertTensorEq(gradM, torch.ger(gradOutput, V), 1e-10,
+ 'Wrong gradient for input M')
+ -- output(i) = sum_j M(i,j) * V(j), so d output(i) / d V(j) = M(i,j),
+ -- i.e. gradV = M:t() * gradOutput
+ mytester:assertTensorEq(gradV, M:t() * gradOutput, 1e-10,
+ 'Wrong gradient for input V')
+end
+
+function nntest.BatchMVNoTranspose()
+ local mv = nn.MV()
+ local outdim = torch.random(10,20)
+ local indim = torch.random(10,20)
+ for bSize = 1, 11, 5 do
+ local M = torch.randn(bSize, outdim, indim)
+ local V = torch.randn(bSize, indim)
+
+ -- Test forward pass.
+ local output = mv:forward({M, V})
+ mytester:assertTableEq(output:size():totable(), {bSize, outdim},
+ 'Output has wrong dimensionality')
+ for i = 1, bSize do
+ mytester:assertTensorEq(output[i], M[i] * V[i], 1e-10,
+ 'Output wrong for bSize = ' .. bSize .. ' and i = ' .. i)
+ end
+
+ -- Test backward pass.
+ local gradOutput = torch.randn(bSize, outdim)
+ local gradInput = mv:backward({M, V}, gradOutput)
+ mytester:assert(#gradInput == 2, 'gradInput must be table of size 2')
+ local gradM, gradV = table.unpack(gradInput)
+ mytester:assertTableEq(gradM:size():totable(), M:size():totable(),
+ 'Gradient for input M has wrong size')
+ mytester:assertTableEq(gradV:size():totable(), V:size():totable(),
+ 'Gradient for input V has wrong size')
+ for i = 1, bSize do
+ mytester:assertTensorEq(gradM[i], torch.ger(gradOutput[i], V[i]), 1e-10,
+ 'Gradient for input M wrong for bSize = ' .. bSize .. ' and i = ' .. i)
+ mytester:assertTensorEq(gradV[i], M[i]:t() * gradOutput[i], 1e-10,
+ 'Gradient for input V wrong for bSize = ' .. bSize .. ' and i = ' .. i)
+ end
+ end
+end
+
+function nntest.BatchMVTranspose()
+ local mv = nn.MV(true)
+ local outdim = torch.random(10,20)
+ local indim = torch.random(10,20)
+ for bSize = 1, 11, 5 do
+ local M = torch.randn(bSize, indim, outdim)
+ local V = torch.randn(bSize, indim)
+
+ -- Test forward pass.
+ local output = mv:forward({M, V})
+ mytester:assertTableEq(output:size():totable(), {bSize, outdim},
+ 'Output has wrong dimensionality')
+ for i = 1, bSize do
+ mytester:assertTensorEq(output[i], M[i]:t() * V[i], 1e-10,
+ 'Output wrong for bSize = ' .. bSize .. ' and i = ' .. i)
+ end
+
+ -- Test backward pass.
+ local gradOutput = torch.randn(bSize, outdim)
+ local gradInput = mv:backward({M, V}, gradOutput)
+ mytester:assert(#gradInput == 2, 'gradInput must be table of size 2')
+ local gradM, gradV = table.unpack(gradInput)
+ mytester:assertTableEq(gradM:size():totable(), M:size():totable(),
+ 'Gradient for input M has wrong size')
+ mytester:assertTableEq(gradV:size():totable(), V:size():totable(),
+ 'Gradient for input V has wrong size')
+ for i = 1, bSize do
+ mytester:assertTensorEq(gradM[i], torch.ger(V[i], gradOutput[i]), 1e-10,
+ 'Gradient for input M wrong for bSize = ' .. bSize .. ' and i = ' .. i)
+ mytester:assertTensorEq(gradV[i], M[i] * gradOutput[i], 1e-10,
+ 'Gradient for input V wrong for bSize = ' .. bSize .. ' and i = ' .. i)
+ end
+ end
+end
+
+local function createMatrixInputSizes()
+ local M = torch.random(10, 20)
+ local N = torch.random(10, 20)
+ local P = torch.random(10, 20)
+ return M, N, P
+end
+
+function nntest.MM()
+ local mm = nn.MM(false, true)
+ local M, N, P = createMatrixInputSizes()
+ local A = torch.randn(M, N)
+ local B = torch.randn(P, N)
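+ -- For C = A * B:t() the chain rule gives gradA = gradC * B and
+ -- gradB = gradC:t() * A, which the backward assertions below verify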
+
+ -- Test forward pass.
+ local output = mm:forward({A, B})
+ mytester:assertTableEq(output:size():totable(), {M, P},
+ 'Output has wrong dimensionality')
+ mytester:assertTensorEq(output, A * B:t(), 1e-10,
+ 'Wrong output')
+
+ -- Test backward pass.
+ local gradOutput = torch.randn(M, P)
+ local gradInput = mm:backward({A, B}, gradOutput)
+ mytester:assert(#gradInput == 2, 'gradInput must be table of size 2')
+ local gradA, gradB = table.unpack(gradInput)
+ mytester:assertTableEq(gradA:size():totable(), A:size():totable(),
+ 'Gradient for input A has wrong size')
+ mytester:assertTableEq(gradB:size():totable(), B:size():totable(),
+ 'Gradient for input B has wrong size')
+ mytester:assertTensorEq(gradA, gradOutput * B, 1e-10,
+ 'Wrong gradient for input A')
+ mytester:assertTensorEq(gradB, gradOutput:t() * A, 1e-10,
+ 'Wrong gradient for input B')
+end
+
+function nntest.BatchMMNoTranspose()
+ local mm = nn.MM()
+ local M, N, P = createMatrixInputSizes()
+ for bSize = 1, 11, 5 do
+ local A = torch.randn(bSize, M, N)
+ local B = torch.randn(bSize, N, P)
+
+ -- Test forward pass.
+ local output = mm:forward({A, B})
+ mytester:assertTableEq(output:size():totable(), {bSize, M, P},
+ 'Output has wrong dimensionality')
+ for i = 1, bSize do
+ mytester:assertTensorEq(output[i], A[i] * B[i], 1e-10,
+ 'Output wrong for bSize = ' .. bSize .. ' and i = ' .. i)
+ end
+
+ -- Test backward pass.
+ local gradOutput = torch.randn(bSize, M, P)
+ local gradInput = mm:backward({A, B}, gradOutput)
+ mytester:assert(#gradInput == 2, 'gradInput must be table of size 2')
+ local gradA, gradB = table.unpack(gradInput)
+ mytester:assertTableEq(gradA:size():totable(), A:size():totable(),
+ 'Gradient for input A has wrong size')
+ mytester:assertTableEq(gradB:size():totable(), B:size():totable(),
+ 'Gradient for input B has wrong size')
+ for i = 1, bSize do
+ mytester:assertTensorEq(gradA[i], gradOutput[i] * B[i]:t(), 1e-10,
+ 'Gradient for input A wrong for bSize = ' .. bSize .. ' and i = ' .. i)
+ mytester:assertTensorEq(gradB[i], A[i]:t() * gradOutput[i], 1e-10,
+ 'Gradient for input B wrong for bSize = ' .. bSize .. ' and i = ' .. i)
+ end
+ end
+end
+
+function nntest.BatchMMTransposeA()
+ local mm = nn.MM(true, false)
+ local M, N, P = createMatrixInputSizes()
+ for bSize = 1, 11, 5 do
+ local A = torch.randn(bSize, N, M)
+ local B = torch.randn(bSize, N, P)
+
+ -- Test forward pass.
+ local output = mm:forward({A, B})
+ mytester:assertTableEq(output:size():totable(), {bSize, M, P},
+ 'Output has wrong dimensionality')
+ for i = 1, bSize do
+ mytester:assertTensorEq(output[i], A[i]:t() * B[i], 1e-10,
+ 'Output wrong for bSize = ' .. bSize .. ' and i = ' .. i)
+ end
+
+ -- Test backward pass.
+ local gradOutput = torch.randn(bSize, M, P)
+ local gradInput = mm:backward({A, B}, gradOutput)
+ mytester:assert(#gradInput == 2, 'gradInput must be table of size 2')
+ local gradA, gradB = table.unpack(gradInput)
+ mytester:assertTableEq(gradA:size():totable(), A:size():totable(),
+ 'Gradient for input A has wrong size')
+ mytester:assertTableEq(gradB:size():totable(), B:size():totable(),
+ 'Gradient for input B has wrong size')
+ for i = 1, bSize do
+ mytester:assertTensorEq(gradA[i], B[i] * gradOutput[i]:t(), 1e-10,
+ 'Gradient for input A wrong for bSize = ' .. bSize .. ' and i = ' .. i)
+ mytester:assertTensorEq(gradB[i], A[i] * gradOutput[i], 1e-10,
+ 'Gradient for input B wrong for bSize = ' .. bSize .. ' and i = ' .. i)
+ end
+ end
+end
+
+function nntest.BatchMMTransposeB()
+ local mm = nn.MM(false, true)
+ local M, N, P = createMatrixInputSizes()
+ for bSize = 1, 11, 5 do
+ local A = torch.randn(bSize, M, N)
+ local B = torch.randn(bSize, P, N)
+
+ -- Test forward pass.
+ local output = mm:forward({A, B})
+ mytester:assertTableEq(output:size():totable(), {bSize, M, P},
+ 'Output has wrong dimensionality')
+ for i = 1, bSize do
+ mytester:assertTensorEq(output[i], A[i] * B[i]:t(), 1e-10,
+ 'Output wrong for bSize = ' .. bSize .. ' and i = ' .. i)
+ end
+
+ -- Test backward pass.
+ local gradOutput = torch.randn(bSize, M, P)
+ local gradInput = mm:backward({A, B}, gradOutput)
+ mytester:assert(#gradInput == 2, 'gradInput must be table of size 2')
+ local gradA, gradB = table.unpack(gradInput)
+ mytester:assertTableEq(gradA:size():totable(), A:size():totable(),
+ 'Gradient for input A has wrong size')
+ mytester:assertTableEq(gradB:size():totable(), B:size():totable(),
+ 'Gradient for input B has wrong size')
+ for i = 1, bSize do
+ mytester:assertTensorEq(gradA[i], gradOutput[i] * B[i], 1e-10,
+ 'Gradient for input A wrong for bSize = ' .. bSize .. ' and i = ' .. i)
+ mytester:assertTensorEq(gradB[i], gradOutput[i]:t() * A[i], 1e-10,
+ 'Gradient for input B wrong for bSize = ' .. bSize .. ' and i = ' .. i)
+ end
+ end
+end
+
+function nntest.BatchMMTransposeBoth()
+ local mm = nn.MM(true, true)
+ local M, N, P = createMatrixInputSizes()
+ for bSize = 1, 11, 5 do
+ local A = torch.randn(bSize, N, M)
+ local B = torch.randn(bSize, P, N)
+
+ -- Test forward pass.
+ local output = mm:forward({A, B})
+ mytester:assertTableEq(output:size():totable(), {bSize, M, P},
+ 'Output has wrong dimensionality')
+ for i = 1, bSize do
+ mytester:assertTensorEq(output[i], A[i]:t() * B[i]:t(), 1e-10,
+ 'Output wrong for bSize = ' .. bSize .. ' and i = ' .. i)
+ end
+
+ -- Test backward pass.
+ local gradOutput = torch.randn(bSize, M, P)
+ local gradInput = mm:backward({A, B}, gradOutput)
+ mytester:assert(#gradInput == 2, 'gradInput must be table of size 2')
+ local gradA, gradB = table.unpack(gradInput)
+ mytester:assertTableEq(gradA:size():totable(), A:size():totable(),
+ 'Gradient for input A has wrong size')
+ mytester:assertTableEq(gradB:size():totable(), B:size():totable(),
+ 'Gradient for input B has wrong size')
+ for i = 1, bSize do
+ mytester:assertTensorEq(gradA[i], B[i]:t() * gradOutput[i]:t(), 1e-10,
+ 'Gradient for input A wrong for bSize = ' .. bSize .. ' and i = ' .. i)
+ mytester:assertTensorEq(gradB[i], gradOutput[i]:t() * A[i]:t(), 1e-10,
+ 'Gradient for input B wrong for bSize = ' .. bSize .. ' and i = ' .. i)
+ end
+ end
+end
+
+function nntest.DotProduct()
+ local indim = math.random(1,10)
+
+ -- test 1D forward
+ local input = {torch.rand(indim),torch.rand(indim)}
+ local module = nn.DotProduct()
+ local expected = input[1]:dot(input[2])
+ local output = module:forward(input)
+ mytester:assertlt(math.abs(expected-output[1]), precision, 'error on forward ')
+
+ -- check gradients
+ -- Note: testJacobian doesn't support table inputs; rather than rewrite
+ -- it to do so, we feed the input through a SplitTable module.
+ -- This assumes both SplitTable and Sequential are bug-free, otherwise this
+ -- test will break.
+ local input = torch.rand(2,indim)
+ local module = nn.Sequential()
+ module:add(nn.SplitTable(1))
+ module:add(nn.DotProduct())
+
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ -- IO
+ local ferr,berr = jac.testIO(module,input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+
+ -- batch
+ -- rebuild module to avoid correlated tests
+ local module = nn.Sequential()
+ module:add(nn.SplitTable(1))
+ module:add(nn.DotProduct())
+
+ local nframes = math.random(1,10)
+ local indim = math.random(1,10)
+ local input = torch.rand(2,nframes,indim)
+
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,precision, 'batch error on state ')
+end
+
+function nntest.CosineDistance()
+ local indim = math.random(1,10)
+ local input = {torch.rand(indim),torch.rand(indim)}
+
+ -- check forward against previous implementation
+ local module = nn.CosineDistance()
+
+ local w1 = input[1]:dot(input[2])
+ local w2 = math.sqrt(input[1]:dot(input[1]))
+ local w3 = math.sqrt(input[2]:dot(input[2]))
+ local output_old = w1/w2/w3
+
+ local output = module:forward(input)
+
+ mytester:assertlt(math.abs(output_old-output[1]),precision,'error on forward ')
+
+ -- check gradients
+ -- Note: testJacobian doesn't support table inputs; rather than rewrite
+ -- it to do so, we feed the input through a SplitTable module.
+ -- This assumes both SplitTable and Sequential are bug-free, otherwise this
+ -- test will break.
+ local input = torch.rand(2,indim)
+ local module = nn.Sequential()
+ module:add(nn.SplitTable(1))
+ module:add(nn.CosineDistance())
+
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ -- IO
+ local ferr,berr = jac.testIO(module,input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+
+ -- batch
+ -- rebuild module to avoid correlated tests
+ local module = nn.Sequential()
+ module:add(nn.SplitTable(1))
+ module:add(nn.CosineDistance())
+
+ local nframes = math.random(1,10)
+ local indim = math.random(1,10)
+ local input = torch.rand(2,nframes,indim)
+
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,precision, 'batch error on state ')
+
+end
+
+function nntest.CosineEmbeddingCriterion()
+ local v1 = torch.Tensor{1, 0}
+ local v2 = torch.Tensor{0.5, math.sqrt(3)*0.5}
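+ -- cos(v1, v2) = 0.5 (the vectors are 60 degrees apart), so with margin 0.6
+ -- and target -1 the loss is max(0, 0.5 - 0.6) = 0 and the gradients vanish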
+
+ local crit = nn.CosineEmbeddingCriterion(0.6)
+ local output = crit:forward({v1, v2}, -1) -- must be called before backward
+ local grads = crit:backward({v1, v2}, -1)
+
+ local zero = torch.Tensor(2):zero()
+ equal(grads[1], zero, 'gradient should be zero')
+ equal(grads[2], zero, 'gradient should be zero')
+
+ -- check jacobians
+ local margin = math.random()*2-1
+ local dim = 5
+ local batch_size = 1
+ local crit = nn.CosineEmbeddingCriterion(margin)
+ local v = torch.rand(2,dim)
+ criterionJacobianTest1DTable(crit,v,1)
+ criterionJacobianTest1DTable(crit,v,-1)
+
+ -- batch with hand-computed values
+ local v1 = torch.Tensor{{1, 0}, {0.5, math.sqrt(3)*0.5}}
+ local v2 = torch.Tensor{{0.5, math.sqrt(3)*0.5}, {1, 0}}
+
+ local t = torch.Tensor{-1,-1}
+ local crit = nn.CosineEmbeddingCriterion(0.6)
+ local output = crit:forward({v1, v2}, t) -- must be called before backward
+ local grads = crit:backward({v1, v2}, t)
+
+ local zero = torch.Tensor(2,2):zero()
+ equal(grads[1], zero, 'gradient should be zero')
+ equal(grads[2], zero, 'gradient should be zero')
+
+ -- batch, sizeAverage true, jacobian
+ local margin = math.random()*2-1
+ local dim = 5
+ local batch_size = 2
+ local crit = nn.CosineEmbeddingCriterion(margin)
+ crit.sizeAverage = true
+ local v = torch.rand(2,batch_size,dim)
+ local t = torch.Tensor(batch_size):random(0,1):mul(2):add(-1)
+ criterionJacobianTest1DTable(crit,v,t)
+
+ -- batch, sizeAverage false, jacobian
+ local margin = math.random()*2-1
+ local crit = nn.CosineEmbeddingCriterion(margin)
+ crit.sizeAverage = false
+ local v = torch.rand(2,batch_size,dim)
+ local t = torch.Tensor(batch_size):random(0,1):mul(2):add(-1)
+ criterionJacobianTest1DTable(crit,v,t)
+end
+
+function nntest.HingeEmbeddingCriterion()
+ local x = torch.Tensor{0.3,2.1,1.8,0}
+ local y = torch.Tensor{1,-1,-1,1}
+ local expgrads = torch.Tensor{1,0,-1,1} / 4
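+ -- With margin 2: loss(x,1) = x and loss(x,-1) = max(0, 2 - x), so the
+ -- per-sample losses are {0.3, 0, 0.2, 0} and sizeAverage divides by 4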
+
+ local crit = nn.HingeEmbeddingCriterion(2)
+ local output = crit:forward(x, y) -- must be called before backward
+ local grads = crit:backward(x, y)
+
+ mytester:assert(math.abs(output - (0.3 + 0.2) / 4) < 1e-10)
+ equal(grads, expgrads)
+end
+
+function nntest.Replicate()
+ local vector = torch.rand(3)
+
+ local r1 = nn.Replicate(2, 1)
+ local r2 = nn.Replicate(2, 2)
+
+ local vOutput1 = r1:forward(vector):clone()
+ local vOutput2 = r2:forward(vector):clone()
+
+ local expected1 = torch.zeros(2, 3)
+ local expected2 = torch.zeros(3, 2)
+ expected1:select(1, 1):copy(vector)
+ expected1:select(1, 2):copy(vector)
+ expected2:select(2, 1):copy(vector)
+ expected2:select(2, 2):copy(vector)
+
+ mytester:assertTensorEq(vOutput1, expected1, precision, 'Wrong tiling of data when replicating vector.')
+ mytester:assertTensorEq(vOutput2, expected2, precision, 'Wrong tiling of data when replicating vector.')
+
+ -- batch mode
+ local vector = torch.rand(4,3)
+
+ local r1 = nn.Replicate(2, 1, 1)
+ local r2 = nn.Replicate(2, 2, 1)
+
+ local vOutput1 = r1:forward(vector):clone()
+ local vOutput2 = r2:forward(vector):clone()
+
+ local expected1 = torch.zeros(4, 2, 3)
+ local expected2 = torch.zeros(4, 3, 2)
+ expected1:select(2, 1):copy(vector)
+ expected1:select(2, 2):copy(vector)
+ expected2:select(3, 1):copy(vector)
+ expected2:select(3, 2):copy(vector)
+
+ mytester:assertTensorEq(vOutput1, expected1, precision, 'Wrong tiling of data when replicating batch vector.')
+ mytester:assertTensorEq(vOutput2, expected2, precision, 'Wrong tiling of data when replicating batch vector.')
+end
+
+local function testBatchNormalization(moduleName, dim, k)
+ local planes = torch.random(1,k)
+ local size = { torch.random(2, k), planes }
+ for i=1,dim do
+ table.insert(size, torch.random(1,k))
+ end
+ local input = torch.zeros(table.unpack(size)):uniform()
+
+ local function jacTests(module, input, affine)
+ local err = jac.testJacobian(module,input)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ if affine then
+ local err = jac.testJacobianParameters(module, input,
+ module.weight, module.gradWeight)
+ mytester:assertlt(err,precision, 'error on weight ')
+
+ local err = jac.testJacobianParameters(module, input,
+ module.bias, module.gradBias)
+ mytester:assertlt(err,precision, 'error on bias ')
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.weight)
+ mytester:assertlt(err,precision, 'error on weight [direct update] ')
+
+ local err = jac.testJacobianUpdateParameters(module, input, module.bias)
+ mytester:assertlt(err,precision, 'error on bias [direct update] ')
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'weight', 'gradWeight')) do
+ mytester:assertlt(err, precision, string.format(
+ 'error on weight [%s]', t))
+ end
+
+ for t,err in pairs(jac.testAllUpdate(module, input, 'bias', 'gradBias')) do
+ mytester:assertlt(err, precision, string.format('error on bias [%s]', t))
+ end
+ end
+
+ -- IO
+ local ferr,berr = jac.testIO(module,input)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+ end
+
+ local module = nn[moduleName](planes)
+ module:training()
+ jacTests(module, input, true)
+ module:evaluate()
+ jacTests(module, input, true)
+ jacTests(module, input[1], true)
+
+ -- batch norm without affine transform
+ module = nn[moduleName](planes, 1e-5, 0.1, false)
+ module:training()
+ jacTests(module, input, false)
+ module:evaluate()
+ jacTests(module, input, false)
+ jacTests(module, input[1], false)
+end
+
+function nntest.BatchNormalization()
+ testBatchNormalization('BatchNormalization', 0, 20)
+end
+
+function nntest.SpatialBatchNormalization()
+ testBatchNormalization('SpatialBatchNormalization', 2, 6)
+end
+
+function nntest.VolumetricBatchNormalization()
+ testBatchNormalization('VolumetricBatchNormalization', 3, 4)
+end
+
+function nntest.GradientReversal()
+ local ini = math.random(3,5)
+ local inj = math.random(3,5)
+ local ink = math.random(3,5)
+ local input = torch.Tensor(ini,inj,ink):zero()
+ -- Two GradientReversal layers should cancel each other out
+ local module = nn.Sequential()
+ module:add(nn.GradientReversal())
+ module:add(nn.GradientReversal())
+
+ local err = jac.testJacobian(module,input, 0.1, 10)
+ mytester:assertlt(err,precision, 'error on state ')
+
+ local ferr,berr = jac.testIO(module,input, 0.1, 10)
+ mytester:eq(ferr, 0, torch.typename(module) .. ' - i/o forward err ', precision)
+ mytester:eq(berr, 0, torch.typename(module) .. ' - i/o backward err ', precision)
+end
+
+function nntest.Padding()
+ local fanin = math.random(1,3)
+ local sizex = math.random(4,16)
+ local sizey = math.random(4,16)
+ local pad = math.random(-3,3)
+ local index = math.random(1, fanin)
+ local val = torch.randn(1):squeeze()
+ local module = nn.Padding(1, pad, 3, val, index)
+ local input = torch.rand(fanin,sizey,sizex)
+ local size = input:size():totable()
+ size[1] = size[1] + math.abs(pad)
+
+ local output = module:forward(input)
+ mytester:assertTableEq(size, output:size():totable(), 0.00001, "Padding size error")
+
+ local gradInput = module:backward(input, output)
+ mytester:assertTensorEq(gradInput, input, 0.00001, "Padding backward error")
+end
+
+function nntest.addSingletonDimension()
+ local dims = torch.random(5)
+ local size = torch.LongTensor(dims):random(10)
+ local perm = torch.randperm(dims):totable()
+ local tensor = torch.Tensor(table.unpack(size:totable())):uniform():permute(table.unpack(perm))
+ size = torch.gather(size, 1, torch.LongTensor(perm))
+
+ local firstDim = nn.utils.addSingletonDimension(tensor)
+ mytester:assertTableEq(firstDim:size():totable(), {1, table.unpack(size:totable())},
+ "wrong size for singleton dimension 1")
+ mytester:assertTensorEq(firstDim[1], tensor, 0,
+ "wrong content for singleton dimension 1")
+
+ local dim = torch.random(dims + 1)
+ local result = nn.utils.addSingletonDimension(tensor, dim)
+ local resultSize = size:totable()
+ table.insert(resultSize, dim, 1)
+ mytester:assertTableEq(result:size():totable(), resultSize,
+ "wrong size for random singleton dimension")
+ mytester:assertTensorEq(result:select(dim, 1), tensor, 0,
+ "wrong content for random singleton dimension")
+
+ mytester:assertError(function() nn.utils.addSingletonDimension(tensor, dims + 2) end,
+ "invalid dimension not detected")
+
+ -- passing output tensor as argument
+ local resultArg = torch.Tensor()
+ local resultR = nn.utils.addSingletonDimension(resultArg, tensor, dim)
+ mytester:eq(resultArg:size():totable(), resultSize,
+ 'wrong size for random singleton dimension '..
+ 'when the result is passed as argument')
+ mytester:eq(resultArg, result, 'wrong content for random singleton dimension '..
+ 'when the result is passed as argument')
+
+ mytester:eq(resultR == resultArg, true,
+ 'new tensor is created when it should use the provided tensor')
+end
+
+function nntest.SpatialReflectionPadding()
+ local batch = math.random(1,3)
+ local plane = math.random(1,3)
+ local sizeY = math.random(7,16)
+ local sizeX = math.random(7,16)
+ local padL = math.random(-3,3)
+ local padR = math.random(-3,3)
+ local padT = math.random(-3,3)
+ local padB = math.random(-3,3)
+ local jac = nn.Jacobian
+ local layer = nn.SpatialReflectionPadding(padL, padR, padT, padB)
+ local input = torch.rand(batch, plane, sizeY, sizeX)
+ local err = jac.testJacobian(layer, input)
+ mytester:assertalmosteq(err, 0.0, 1e-7)
+end
+
+function nntest.SpatialReplicationPadding()
+ local batch = math.random(1,3)
+ local plane = math.random(1,3)
+ local sizeY = math.random(7,16)
+ local sizeX = math.random(7,16)
+ local padL = math.random(-3,3)
+ local padR = math.random(-3,3)
+ local padT = math.random(-3,3)
+ local padB = math.random(-3,3)
+ local jac = nn.Jacobian
+ local layer = nn.SpatialReplicationPadding(padL, padR, padT, padB)
+ local input = torch.rand(batch, plane, sizeY, sizeX)
+ local err = jac.testJacobian(layer, input)
+ mytester:assertalmosteq(err, 0.0, 1e-7)
+end
+
+function nntest.VolumetricReplicationPadding()
+ for batch = 0, 1 do
+ local nbatch
+ if batch == 1 then
+ nbatch = math.random(1,3)
+ end
+ local plane = math.random(1,3)
+ local sizeZ = math.random(1,4)
+ local sizeY = math.random(7,11)
+ local sizeX = math.random(7,11)
+ local padLeft = math.random(-3,3)
+ local padRight = math.random(-3,3)
+ local padTop = math.random(-3,3)
+ local padBottom = math.random(-3,3)
+ local padFront = math.random(-3,3)
+ local padBack = math.random(-3,3)
+ local jac = nn.Jacobian
+ local layer =
+ nn.VolumetricReplicationPadding(padLeft, padRight, padTop,
+ padBottom, padFront, padBack)
+ local input
+ if batch == 1 then
+ input = torch.rand(nbatch, plane, sizeZ, sizeY, sizeX)
+ else
+ input = torch.rand(plane, sizeZ, sizeY, sizeX)
+ end
+ local err = jac.testJacobian(layer, input)
+ mytester:assertalmosteq(err, 0.0, 1e-7)
+ end
+end
+
+function nntest.PixelShuffle()
+ -- Checks whether a given tensor has the specified size
+ local function tensorHasSize(tensor, size)
+ local tensorSize = tensor:size()
+
+ if tensorSize:size() ~= #size then
+ return false
+ end
+ for i,v in ipairs(size) do
+ if tensorSize[i] ~= size[i] then
+ return false
+ end
+ end
+ return true
+ end
+
+ -- Verifies that the output is the input re-shuffled as per Eq. 4 in
+ -- "Real-Time Single Image and Video Super-Resolution Using an Efficient
+ -- Sub-Pixel Convolutional Neural Network", Shi et al.
+ -- @param _input - the input, low-resolution image of shape [1, c, h, w]
+ -- @param _output - the output, super-resolved image of shape [1, c, h, w]
+ -- @param upscaleFactor - upscale factor of the super-resolution
+ -- @returns true if the output complies with Eq. 4
+ local function verifyPixelShuffle(_input, _output, upscaleFactor)
+ local input = _input
+ local output = _output
+
+ if input:nDimension() == 3 then
+ input = input:view(1, input:size(1), input:size(2), input:size(3))
+ output = output:view(1, output:size(1), output:size(2), output:size(3))
+ end
+
+ for c = 1, output:size(2) do
+ for h = 1, output:size(3) do
+ for w = 1, output:size(4) do
+ local heightIdx = torch.floor((h - 1)/upscaleFactor) + 1
+ local widthIdx = torch.floor((w - 1)/upscaleFactor) + 1
+ --c does not need to be (c - 1) as it starts at 1 not zero
+ local channelIdx = upscaleFactor * ((h-1) % upscaleFactor) + ((w-1) % upscaleFactor) + 1 + (c-1)*upscaleFactor*upscaleFactor
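+ -- Worked example with upscaleFactor = 2: output voxel (c=1, h=2, w=3)
+ -- maps to channelIdx = 2*1 + 0 + 1 + 0 = 3, heightIdx = 1, widthIdx = 2,
+ -- i.e. output[1][2][3] comes from input channel 3 at position (1, 2)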
+
+ mytester:assertTensorEq(output[{{}, {c}, {h}, {w}}], input[{{}, {channelIdx}, {heightIdx}, {widthIdx}}], 0,
+ string.format("output at location (%d, %d, %d) is incorrect", c, h, w))
+ end
+ end
+ end
+ return true
+ end
+
+ -- Checks the nn.PixelShuffle layer's forward pass. It checks that it
+ -- re-arranges input pixels correctly according to Eq. 4 of
+ -- "Real-Time Single Image and Video Super-Resolution Using an Efficient
+ -- Sub-Pixel Convolutional Neural Network", Shi et al.
+ -- This function tests multiple batch sizes, channel counts and (square) input sizes.
+ -- It also tests normal (un-batched) tensors
+ local function testPixelShuffleUpdateOutput()
+ --Test with batched input
+ for h = 1, 3 do
+ local batchSize = torch.round(torch.uniform(1, 3))
+ for i = 1, 3 do
+ local upscaleFactor = torch.round(torch.uniform(2,5))
+ local pixelShuffle = nn.PixelShuffle(upscaleFactor)
+ for j = 1, 3 do
+ local channels = torch.round(torch.uniform(1, 4))
+ for k = 1, 3 do
+
+ local inputDim = torch.round(torch.uniform(5, 10))
+ local input = torch.Tensor(batchSize, channels * upscaleFactor * upscaleFactor, inputDim, inputDim)
+ input:uniform()
+
+ local output = pixelShuffle:forward(input)
+ local expectedOutputDim = inputDim * upscaleFactor
+ mytester:assert(tensorHasSize(output, {batchSize, channels, expectedOutputDim, expectedOutputDim}),
+ string.format("Output tensor should have size (%d, %d, %d, %d) not %s", batchSize, channels, expectedOutputDim, expectedOutputDim, tostring(output:size())))
+ verifyPixelShuffle(input, output, upscaleFactor)
+ end
+ end
+ end
+ end
+
+ --Test with non-batched input
+ local inputDim = torch.round(torch.uniform(5, 10))
+ local channels = torch.round(torch.uniform(1, 4))
+ local upscaleFactor = torch.round(torch.uniform(2,5))
+
+ local input = torch.Tensor(channels * upscaleFactor * upscaleFactor, inputDim, inputDim)
+ input:uniform()
+
+ local pixelShuffle = nn.PixelShuffle(upscaleFactor)
+ local output = pixelShuffle:forward(input)
+ local expectedOutputDim = inputDim * upscaleFactor
+ mytester:assert(tensorHasSize(output, {channels, expectedOutputDim, expectedOutputDim}),
+ string.format("Output tensor should have size (%d, %d, %d) not %s", channels, expectedOutputDim, expectedOutputDim, tostring(output:size())))
+
+ verifyPixelShuffle(input, output, upscaleFactor)
+ end
+
+ -- Checks the nn.PixelShuffle layer's backward pass. It checks that it
+ -- essentially performs the inverse of Eq. 4 in
+ -- "Real-Time Single Image and Video Super-Resolution Using an Efficient
+ -- Sub-Pixel Convolutional Neural Network", Shi et al.
+ -- This function tests multiple batch sizes, channel counts and (square) input sizes.
+ -- It also tests normal (un-batched) tensors
+ local function testPixelShuffleUpdateGradInput()
+ --Test with batched input
+ for h = 1, 3 do
+ local batchSize = torch.round(torch.uniform(1, 3))
+ for i = 1, 3 do
+ local upscaleFactor = torch.round(torch.uniform(2,5))
+ local pixelShuffle = nn.PixelShuffle(upscaleFactor)
+ for j = 1, 3 do
+ local channels = torch.round(torch.uniform(1, 4))
+ for k = 1, 3 do
+ local inputDim = torch.round(torch.uniform(5, 10))
+ local input = torch.Tensor(batchSize, channels * upscaleFactor * upscaleFactor, inputDim, inputDim)
+
+ input:uniform()
+
+ local output = pixelShuffle:forward(input)
+ -- reuse the output as gradOutput, since they have the same shape
+ local reconstructedInput = pixelShuffle:backward(input, output)
+ mytester:assertTensorEq(reconstructedInput, input, 0)
+ end
+ end
+ end
+ end
+
+ --Test with non-batched input
+ local inputDim = torch.round(torch.uniform(5, 10))
+ local channels = torch.round(torch.uniform(1, 4))
+ local upscaleFactor = torch.round(torch.uniform(2,5))
+ local input = torch.Tensor(channels * upscaleFactor * upscaleFactor, inputDim, inputDim)
+ input:uniform()
+
+ local pixelShuffle = nn.PixelShuffle(upscaleFactor)
+ local output = pixelShuffle:forward(input)
+ -- reuse the output as gradOutput, since they have the same shape
+ local reconstructedInput = pixelShuffle:backward(input, output)
+ mytester:assertTensorEq(reconstructedInput, input, 0)
+
+ local err = jac.testJacobian(pixelShuffle, input)
+ mytester:assertlt(err,precision, "error computing gradiens w.r.t. inputs")
+ end
+
+ local function testModuleIO()
+ --Test with non-batched input
+ local inputDim = torch.round(torch.uniform(5, 10))
+ local channels = torch.round(torch.uniform(1, 4))
+ local upscaleFactor = torch.round(torch.uniform(2,5))
+ local input = torch.Tensor(channels * upscaleFactor * upscaleFactor, inputDim, inputDim):uniform()
+ local pixelShuffle = nn.PixelShuffle(upscaleFactor)
+
+ local fwdErr,bkwdErr = jac.testIO(pixelShuffle,input)
+ mytester:asserteq(fwdErr, 0, torch.typename(pixelShuffle) .. " - i/o forward err ")
+ mytester:asserteq(bkwdErr, 0, torch.typename(pixelShuffle) .. " - i/o backward err ")
+ end
+
+ testPixelShuffleUpdateOutput()
+ testPixelShuffleUpdateGradInput()
+ testModuleIO()
+end
+
+function nntest.Typecast()
+ local function make_network()
+ local seq = nn.Sequential()
+ seq:add(nn.Linear(15, 10))
+ seq:add(nn.Linear(15, 10))
+ seq.modules[1].bias:fill(1)
+ seq.modules[2].bias:fill(2)
+ return seq
+ end
+
+ -- make sure that the typecasts aren't nops
+ assert(torch.getdefaulttensortype() == 'torch.DoubleTensor')
+
+ -- basic net
+ local net = make_network()
+ net.modules[1].empty_tensor = torch.Tensor()
+ net:float()
+ assert(net.modules[1].bias:type() == 'torch.FloatTensor',
+ net.modules[1].bias:type())
+ assert(net.modules[1].empty_tensor:type() == 'torch.FloatTensor')
+ assert(net.modules[1].bias ~= net.modules[2].bias)
+ net.modules[1].bias:fill(3)
+ assert(net.modules[1].bias[1] == 3)
+ assert(net.modules[2].bias[1] == 2)
+
+ -- shared tensors remain shared
+ local net = make_network()
+ net.modules[2].bias = net.modules[1].bias
+ net:float()
+ assert(net.modules[1].bias:type() == 'torch.FloatTensor')
+ assert(net.modules[1].bias == net.modules[2].bias)
+ assert(net.modules[1].bias[1] == 1)
+
+ -- shared storages remain shared
+ local net = make_network()
+ net.modules[2].bias:set(net.modules[1].bias)
+ local net = net:float()
+ assert(net.modules[1].bias:type() == 'torch.FloatTensor')
+ assert(net.modules[1].bias ~= net.modules[2].bias)
+ net.modules[1].bias:fill(3)
+ assert(net.modules[1].bias[1] == 3)
+ assert(net.modules[2].bias[1] == 3)
+
+ -- tricky: overlapping views on the same storage are preserved
+ local net = make_network()
+ local overlap_storage = torch.Tensor(15):fill(1)
+ net.modules[1].bias = overlap_storage:narrow(1, 1, 10)
+ net.modules[2].bias = overlap_storage:narrow(1, 6, 10)
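+ -- narrow(1, 1, 10) covers storage elements 1..10 and narrow(1, 6, 10)
+ -- covers 6..15, so the two biases share exactly 5 underlying elements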
+ net:float()
+ assert(net.modules[1].bias:type() == 'torch.FloatTensor')
+ assert(net.modules[1].bias ~= net.modules[2].bias)
+ net.modules[1].bias:fill(3)
+ assert(net.modules[1].bias[1] == 3)
+ assert(net.modules[2].bias[1] == 3)
+ assert(net.modules[2].bias[6] == 1) -- only the first 5 elements overlapped
+
+ -- check recursiveType on a table
+ local net1 = make_network()
+ local net2 = make_network()
+ net2.modules[1].bias:set(net1.modules[1].bias)
+ net1:float()
+ net2:float()
+ net1.modules[1].bias:fill(3)
+ assert(net2.modules[1].bias[1] == 1)
+
+ local net1 = make_network()
+ local net2 = make_network()
+ net2.modules[1].bias:set(net1.modules[1].bias)
+
+ local tensorCache = {}
+ net1:type('torch.FloatTensor', tensorCache)
+ net2:type('torch.FloatTensor', tensorCache)
+ net1.modules[1].bias:fill(3)
+ assert(net2.modules[1].bias[1] == 3)
+
+ local net1 = make_network()
+ local net2 = make_network()
+ net2.modules[1].bias:set(net1.modules[1].bias)
+
+ nn.utils.recursiveType({net1, net2}, 'torch.FloatTensor')
+ net1.modules[1].bias:fill(3)
+ assert(net2.modules[1].bias[1] == 3)
+
+ -- smoke test some modules with custom type methods
+ local custom_type_modules = {
+ nn.MixtureTable(3),
+ nn.ConcatTable(),
+ nn.Copy(),
+ nn.Copy(nil, nil, nil, true),
+ nn.SpatialContrastiveNormalization(),
+ nn.DotProduct(),
+ nn.PairwiseDistance(1),
+ nn.SpatialDivisiveNormalization(),
+ nn.SpatialSubtractiveNormalization()
+ }
+ for _, module in ipairs(custom_type_modules) do
+ module:float()
+ end
+end
+
+function nntest.Module_apply()
+ local s = nn.Sequential()
+ s:add(nn.Linear(10,10))
+ local s2 = nn.Sequential()
+ s2:add(nn.Linear(10,5))
+ s:add(s2)
+ s:add(nn.Tanh())
+
+ local seen = 0
+ s:apply(function(module)
+ if torch.type(module) == 'nn.Linear' then
+ module.bias:resize(20)
+ seen = seen + 1
+ end
+ end)
+ mytester:asserteq(seen, 2)
+ mytester:asserteq(s.modules[1].bias:size(1), 20)
+ mytester:asserteq(s2.modules[1].bias:size(1), 20)
+end
+
+function nntest.Module_replace()
+ -- test replace in container
+ local s = nn.Sequential()
+ s:add(nn.Linear(10,10))
+ s:add(nn.Sigmoid())
+ s:replace(function(module)
+ return torch.type(module) == 'nn.Sigmoid' and nn.Tanh() or module
+ end)
+ -- test replace of a single module
+ local single = nn.Tanh()
+ local replaced = single:replace(function(module)
+ return torch.type(module) == 'nn.Tanh' and nn.Sigmoid() or module
+ end)
+ mytester:asserteq(torch.type(s:get(2)), 'nn.Tanh', 'replace in container')
+ mytester:asserteq(torch.type(replaced), 'nn.Sigmoid', 'replace in single module')
+end
+
+function nntest.Cosine()
+ local inputSize = 4
+ local outputSize = 5
+
+ -- test 1D
+ local input = torch.randn(inputSize)
+ local gradOutput = torch.randn(outputSize)
+ local cosine = nn.Cosine(inputSize,outputSize)
+ local output = cosine:forward(input)
+ local inputNorm = input:norm()+1e-12
+ local weight2 = cosine.weight[2]
+ local output2 = torch.dot(weight2, input)/((weight2:norm()+1e-12)*inputNorm)
+ mytester:assert(math.abs(output2 - output[2]) < 0.000001,"Cosine output 1D err weight[2]")
+ local output2 = torch.mv(cosine.weight, input)
+ output2:cdiv(cosine.weight:norm(2,2)+1e-12):div(inputNorm)
+ mytester:assertTensorEq(output, output2, 0.000001, "Cosine output 1D err")
+ local gradInput = cosine:updateGradInput(input, gradOutput)
+ local gradInput2 = gradInput:clone():zero()
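+ -- reference formula: d cos(w_j, x) / d x_i
+ -- = w_j[i] / (||x|| ||w_j||) - output[j] * x[i] / ||x||^2, accumulated over j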
+ for j=1,outputSize do
+ local w_j = cosine.weight[j]
+ local nw_j = w_j:norm()+1e-12
+ for i=1,inputSize do
+ local w_ij = w_j[i]
+ local grad_i = (w_ij/(inputNorm*nw_j))
+ grad_i = grad_i - (output[j]*input[i]/(inputNorm*inputNorm))
+ grad_i = grad_i * gradOutput[j]
+ gradInput2[i] = gradInput2[i] + grad_i
+ end
+ end
+ mytester:assertTensorEq(gradInput2, gradInput, 0.000001, "Cosine gradInput 1D err")
+ cosine:zeroGradParameters()
+ cosine:accGradParameters(input, gradOutput, 1)
+ local gradWeight2 = cosine.weight:clone():zero()
+ for j=1,outputSize do
+ local w_j = cosine.weight[j]
+ local nw_j = w_j:norm()+1e-12
+ for i=1,inputSize do
+ local w_ij = w_j[i]
+ local gW_ij = (gradOutput[j]/nw_j) * ( ( input[i] / inputNorm ) - (output[j] * w_ij / nw_j) )
+ gradWeight2[{j,i}] = gW_ij
+ end
+ end
+ mytester:assertTensorEq(cosine.gradWeight, gradWeight2, 0.000001, "Cosine gradWeight 1D err")
+
+ -- test 2D
+ local batchSize = 3
+ local input = torch.randn(batchSize, inputSize)
+ local gradOutput = torch.randn(batchSize, outputSize)
+ cosine:zeroGradParameters()
+ local cosine2 = cosine:clone()
+ local output = cosine:forward(input)
+ local output2 = cosine2:forward(input[2])
+ mytester:assertTensorEq(output[2], output2, 0.000001, "Cosine output 2D err")
+ local gradInput = cosine:backward(input, gradOutput)
+
+ local gradInput2 = gradInput:clone():zero()
+ for i=1,batchSize do
+ cosine2:forward(input[i])
+ gradInput2[i]:copy(cosine2:backward(input[i], gradOutput[i]))
+ end
+ mytester:assertTensorEq(gradInput, gradInput2, 0.000001, "Cosine gradInput 2D err")
+ mytester:assertTensorEq(cosine.gradWeight, cosine2.gradWeight, 0.000001, "Cosine gradWeight 2D err")
+end
+
+function nntest.DistanceRatioCriterion()
+ local sizeAverage = true
+ local crit = nn.DistanceRatioCriterion(sizeAverage)
+ local X = torch.rand(32,1):fill(1)
+ local Y = torch.rand(32,1):fill(1)
+
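+ -- Assuming the criterion computes -log( exp(-X) / (exp(-X) + exp(-Y)) )
+ -- = X + log(exp(-X) + exp(-Y)) per sample: with X = Y = 1 everywhere each
+ -- sample contributes 1 + log(2*exp(-1)), matching trueLoss below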
+ -- Unit Test updateOutput
+ local loss = crit:forward({X, Y})
+ local trueLoss = 1 + math.log(math.exp(-1) + math.exp(-1))
+ assert(math.abs(loss - trueLoss) < 0.000001,
+ "DistanceRatioCriterion forward incorrect output")
+
+ -- Unit Test updateGradInput
+ local dxdy = crit:backward({X, Y})
+ local dx = dxdy[1]
+ local dy = dxdy[2]
+ assert(math.abs(dx:sum() - 0.5) < 0.000001,
+ "DistanceRatioCriterion backward (dx) incorrect output")
+ assert(math.abs(dy:sum() + 0.5) < 0.000001,
+ "DistanceRatioCriterion backward (dy) incorrect output")
+end
+
+function nntest.ErrorHandling()
+ local l = nn.Linear(1, 1)
+ local p = nn.Parallel(1, 1):add(l)
+ local c = nn.Concat(1):add(p)
+ local model = nn.Sequential():add(nn.Identity()):add(c):add(nn.Identity())
+ local function errmsg(module, i)
+ return 'In ' .. i .. ' module of ' .. torch.type(module) .. ':\n'
+ end
+ local expected_err = errmsg(model, 2) .. errmsg(c, 1) .. errmsg(p, 1)
+ mytester:assertErrorObj(
+ function()
+ model:forward(torch.randn(1,2,2))
+ end,
+ function(err)
+ return err:find(expected_err) and err:find('size mismatch')
+ end,
+ "Failure expected or bad error message (missing information or reason)"
+ )
+end
+
+function nntest.GPU()
+ -- this is a placeholder to let you know that the nn.GPU unit test
+ -- is located in the cunn package.
+end
+
+function nntest.Profile()
+ local mx_overhead = 0.05
+ local print_every = 3
+ local net = nn.Profile(nn.Linear(3,4), print_every)
+ local input, gradOutput = torch.randn(1, 3), torch.randn(1, 4)
+ local output, gradInput = net:forward(input), net:backward(input, gradOutput)
+ mytester:assertTensorEq(net.modules[1].output, output, 0.000001)
+ mytester:assertTensorEq(net.modules[1].gradInput, gradInput, 0.000001)
+end
+
+function nntest.NaN()
+ local _ = require 'moses'
+ local input = torch.randn(2,3)
+ local gradOutput = torch.randn(2,4)
+ local lin = nn.Linear(3,4)
+ lin:zeroGradParameters()
+ local nan = nn.NaN(lin)
+ mytester:assert(nan.id == 1)
+ -- test that it works when no NaNs are present
+ local output = nan:forward(input):clone()
+ local gradInput = nan:backward(input, gradOutput):clone()
+ local gradWeight = lin.gradWeight:clone()
+ local gradBias = lin.gradBias:clone()
+ lin:zeroGradParameters()
+ local output2 = lin:forward(input)
+ local gradInput2 = lin:backward(input, gradOutput)
+ mytester:assertTensorEq(output, output2, 0.000001)
+ mytester:assertTensorEq(gradInput, gradInput2, 0.000001)
+ mytester:assertTensorEq(gradWeight, lin.gradWeight, 0.000001)
+ mytester:assertTensorEq(gradBias, lin.gradBias, 0.000001)
+ -- test with some NaNs
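+ -- log(0) = -inf and log(-inf) = nan, so this fills the input with NaNs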
+ input:zero():log():log()
+ local sum = input:sum()
+ mytester:assert(_.isNaN(sum))
+ mytester:assert(not pcall(function() nan:forward(input) end))
+ lin.bias:fill(sum)
+ input = torch.randn(2,3)
+ mytester:assert(not pcall(function() nan:forward(input) end))
+ lin.bias:uniform(0,1)
+ gradOutput:fill(sum)
+ mytester:assert(not pcall(function() nan:backward(input, gradOutput) end))
+ gradOutput:uniform(0,1)
+ lin.gradBias:fill(sum)
+ mytester:assert(not pcall(function() nan:backward(input, gradOutput) end))
+end
+
+function nntest.DontCast()
+ local input = torch.randn(3,4)
+ local gradOutput = torch.randn(3,2)
+ local linear = nn.Linear(4,2):float()
+ local mlp = nn.DontCast(linear, true)
+ linear:zeroGradParameters()
+ local linear = linear:clone()
+ local output = mlp:forward(input)
+ local gradInput = mlp:backward(input, gradOutput)
+ mytester:assert(torch.type(output) == 'torch.DoubleTensor')
+ mytester:assert(torch.type(gradInput) == 'torch.DoubleTensor')
+ local output2 = linear:forward(input:float())
+ local gradInput2 = linear:backward(input:float(), gradOutput:float())
+ mytester:assertTensorEq(output:float(), output2, 0.000001)
+ mytester:assertTensorEq(gradInput:float(), gradInput2, 0.000001)
+ local mlp3 = nn.DontCast(linear:clone())
+ mlp3:zeroGradParameters()
+ local output3 = mlp3:forward(input:float())
+ local gradInput3 = mlp3:backward(input:float(), gradOutput:float())
+ mytester:assert(torch.type(output3) == 'torch.FloatTensor')
+ mytester:assert(torch.type(gradInput3) == 'torch.FloatTensor')
+ mytester:assertTensorEq(output3, output2, 0.000001)
+ mytester:assertTensorEq(gradInput3, gradInput2, 0.000001)
+
+ mlp:float()
+ local output4 = mlp:forward(input:float())
+ local gradInput4 = mlp:backward(input:float(), gradOutput:float())
+ mytester:assert(torch.type(output4) == 'torch.FloatTensor')
+ mytester:assert(torch.type(gradInput4) == 'torch.FloatTensor')
+ mytester:assertTensorEq(output3, output4, 0.000001)
+ mytester:assertTensorEq(gradInput3, gradInput4, 0.000001)
+ mlp:double()
+ mytester:assert(torch.type(linear.output) == 'torch.FloatTensor')
+ local output = mlp:forward(input)
+ local gradInput = mlp:backward(input, gradOutput)
+ mytester:assert(torch.type(output4) == 'torch.FloatTensor')
+ mytester:assert(torch.type(gradInput4) == 'torch.FloatTensor')
+ mytester:assertTensorEq(output3, output:float(), 0.000001)
+ mytester:assertTensorEq(gradInput3, gradInput:float(), 0.000001)
+
+ -- test table inputs/outputs
+ local input = {torch.randn(3,4), torch.randn(3,4)}
+ local gradOutput = {torch.randn(3,2), torch.randn(3,2)}
+ local linear = nn.ParallelTable():add(nn.Linear(4,2)):add(nn.Linear(4,2)):float()
+ local mlp = nn.DontCast(linear, true)
+ linear:zeroGradParameters()
+ local linear = linear:clone()
+ local output = mlp:forward(input)
+ local gradInput = mlp:backward(input, gradOutput)
+ mytester:assert(torch.type(output[1]) == 'torch.DoubleTensor')
+ mytester:assert(torch.type(gradInput[1]) == 'torch.DoubleTensor')
+ mytester:assert(torch.type(output[2]) == 'torch.DoubleTensor')
+ mytester:assert(torch.type(gradInput[2]) == 'torch.DoubleTensor')
+ local _ = require 'moses'
+ local finput = _.map(input, function(k,v) return v:float() end)
+ local foutput = _.map(output, function(k,v) return v:float() end)
+ local fgradInput = _.map(gradInput, function(k,v) return v:float() end)
+ local fgradOutput = _.map(gradOutput, function(k,v) return v:float() end)
+ local output2 = linear:forward(finput)
+ local gradInput2 = linear:backward(finput, fgradOutput)
+ mytester:assertTensorEq(foutput[1], output2[1], 0.000001)
+ mytester:assertTensorEq(foutput[2], output2[2], 0.000001)
+ mytester:assertTensorEq(fgradInput[1], gradInput2[1], 0.000001)
+ mytester:assertTensorEq(fgradInput[2], gradInput2[2], 0.000001)
+ local mlp3 = nn.DontCast(linear:clone())
+ mlp3:zeroGradParameters()
+ local output3 = mlp3:forward(finput)
+ local gradInput3 = mlp3:backward(finput, fgradOutput)
+ mytester:assert(torch.type(output3[1]) == 'torch.FloatTensor')
+ mytester:assert(torch.type(gradInput3[1]) == 'torch.FloatTensor')
+ mytester:assert(torch.type(output3[2]) == 'torch.FloatTensor')
+ mytester:assert(torch.type(gradInput3[2]) == 'torch.FloatTensor')
+ mytester:assertTensorEq(output3[1], output2[1], 0.000001)
+ mytester:assertTensorEq(gradInput3[1], gradInput2[1], 0.000001)
+ mytester:assertTensorEq(output3[2], output2[2], 0.000001)
+ mytester:assertTensorEq(gradInput3[2], gradInput2[2], 0.000001)
+ mlp:float()
+ local output4 = mlp:forward(finput)
+ local gradInput4 = mlp:backward(finput, fgradOutput)
+ mytester:assert(torch.type(output4[1]) == 'torch.FloatTensor')
+ mytester:assert(torch.type(gradInput4[1]) == 'torch.FloatTensor')
+ mytester:assert(torch.type(output4[2]) == 'torch.FloatTensor')
+ mytester:assert(torch.type(gradInput4[2]) == 'torch.FloatTensor')
+ mytester:assertTensorEq(output3[1], output4[1], 0.000001)
+ mytester:assertTensorEq(gradInput3[1], gradInput4[1], 0.000001)
+ mytester:assertTensorEq(output3[2], output4[2], 0.000001)
+ mytester:assertTensorEq(gradInput3[2], gradInput4[2], 0.000001)
+ mlp:double()
+ mytester:assert(torch.type(linear.output) == 'table')
+ mytester:assert(torch.type(linear.output[1]) == 'torch.FloatTensor')
+ mytester:assert(torch.type(linear.output[2]) == 'torch.FloatTensor')
+ local output = mlp:forward(input)
+ local gradInput = mlp:backward(input, gradOutput)
+ mytester:assertTensorEq(output3[1], output[1]:float(), 0.000001)
+ mytester:assertTensorEq(gradInput3[1], gradInput[1]:float(), 0.000001)
+end
+
+function nntest.SpatialDepthWiseConvolution()
+ local epsilon = 0.00001
+
+ local SC = nn.SpatialConvolution
+ local SDWC = nn.SpatialDepthWiseConvolution
+
+ local function spatialDepthWiseConv(
+ nInputPlane, multiplier, kernel, stride, padding, inputSize, weight, bias
+ )
+ local conv = SDWC(nInputPlane, multiplier, kernel, kernel, stride, stride, padding, padding)
+ conv.weight = weight
+ conv.bias = bias
+ return conv
+ end
+
+ -- Utility spatialDepthWiseConv_util() function --------------------------------
+ -- By Alfredo Canziani, alfredo.canziani@gmail.com -----------------------------
+ local function spatialDepthWiseConv_util(
+ nInputPlane, multiplier, kernel, stride, padding, inputSize, weight, bias
+ )
+
+ local conv = nn.Sequential()
+ conv:add(nn.Contiguous())
+ conv:add(nn.View(-1, 1, inputSize, inputSize))
+ conv:add(SC(1, multiplier, kernel, kernel, stride, stride, padding, padding))
+
+ local depthWiseConv = nn.Parallel(2, 2)
+ for channel = 1, nInputPlane do
+ local tempConv = conv:clone()
+ tempConv:get(3).weight = weight:narrow(2, channel, 1):clone()
+ tempConv:get(3).bias = bias:select(2, channel):clone()
+ depthWiseConv:add(tempConv)
+ end
+ depthWiseConv:add(nn.Contiguous())
+ return depthWiseConv
+ end
+
+ local n = 3 -- nInputPlane
+ local s = 28 -- input height and width
+ local b = 3 -- batch size
+ local m = 4 -- multiplier
+ local k = 3 -- kernel size
+ local p = 1 -- padding
+ local st = 1 -- stride
+
+ local testBatch = 1e3 -- number of repetitions
+
+ local X = torch.rand(b, n, s, s) -- 3x3x28x28 images
+ local weight = torch.rand(m, n, k, k) -- weight
+ local bias = torch.rand(m, n) -- bias
+
+ local model = spatialDepthWiseConv(n, m, k, st, p, s, weight, bias)
+ local model_util = spatialDepthWiseConv_util(n, m, k, st, p, s, weight, bias)
+
+ local Y_util = model_util:forward(X)
+ local Y = model:forward(X)
+
+ local abs_diff = Y_util:clone():csub(Y):abs()
+ mytester:assert(torch.all(abs_diff:lt(epsilon)))
+end
+
+function nntest.Constant()
+ local input = torch.randn(20,3,7)
+ local gradOutput = torch.randn(20,30,6)
+ local value = torch.randn(30,6)
+ local const = nn.Constant(value:clone(), 2)
+ local output = const:forward(input)
+ local gradInput = const:backward(input, output)
+ local output2 = value:view(1,30,6):expand(20,30,6)
+ mytester:assertTensorEq(output2, output, 0.000001, "Constant forward err")
+ mytester:assertTensorEq(gradInput, input:zero(), 0.000001, "Constant backward err")
+end
+
+function nntest.WhiteNoise()
+ local input = torch.zeros(3, 28, 28)
+ local addNoise = nn.WhiteNoise()
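+ -- assuming the default constructor arguments (zero mean, small std),
+ -- the training-mode output should be near-zero-mean, low-variance noise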
+ local output = addNoise:forward(input)
+ local meanValue = output:mean()
+ local stdValue = output:std()
+ mytester:assert(meanValue > -0.01 and meanValue < 0.01)
+ mytester:assert(stdValue < 0.15 and stdValue >= 0)
+
+ -- Evaluate
+ addNoise:evaluate()
+ output = addNoise:forward(input)
+ meanValue = output:mean()
+ stdValue = output:std()
+ mytester:assert(meanValue == 0)
+ mytester:assert(stdValue == 0)
+
+ -- backprop
+ addNoise:training()
+ local gradOutput = torch.rand(3, 28, 28)
+ local gradInput = addNoise:updateGradInput(input, gradOutput)
+ mytester:assertTensorEq(gradOutput, gradInput, 0.000001, "WhiteNoise backward err")
+end
+
+function nntest.OneHot()
+ local nClass = 10
+
+ -- batch mode
+ local batchSize = 3
+ local input = torch.LongTensor(batchSize):random(1, nClass)
+ local gradOutput = torch.randn(batchSize, nClass)
+
+ local oh = nn.OneHot(nClass)
+
+ local output = oh:forward(input)
+ local output2 = torch.Tensor(batchSize, nClass):zero()
+ local eye = torch.eye(nClass)
+ output2:index(eye, 1, input)
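+ -- row k of the identity matrix is exactly the one-hot encoding of class k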
+ mytester:assertTensorEq(output, output2, 0.000001, "OneHot forward batch err")
+ mytester:assert(output:dim() == 2)
+
+ -- non-batch mode (number input)
+ local num = 3
+ local output3 = torch.zeros(nClass)
+ output3[num] = 1.0
+ mytester:assertTensorEq(oh:forward(num), output3, 0.000001, "OneHot forward number err")
+
+ local gradInput = oh:backward(input, gradOutput)
+ mytester:assertTensorEq(gradInput, input:double():zero(), 0.000001, "OneHot backward batch err")
+
+ if pcall(function() require 'cunn' end) then
+ oh:cuda()
+
+ -- test with long input
+ local output = oh:forward(input)
+ mytester:assert(torch.type(output) == 'torch.CudaTensor')
+ mytester:assertTensorEq(output:double(), output2, 0.000001, "OneHot forward batch long-cuda err")
+
+ -- test with cuda input
+ local input = input:cuda()
+ gradOutput = gradOutput:cuda()
+
+ local output = oh:forward(input)
+ mytester:assert(torch.type(output) == 'torch.CudaTensor')
+ mytester:assertTensorEq(output:double(), output2, 0.000001, "OneHot forward batch cuda err")
+
+ local gradInput2 = oh:backward(input, gradOutput)
+ mytester:assertTensorEq(gradInput, gradInput2:double(), 0.000001, "OneHot backward batch err")
+ cutorch.synchronize()
+
+ -- non-batch mode (number input)
+ mytester:assertTensorEq(oh:forward(num), output3:cuda(), 0.000001, "OneHot forward number err")
+ end
+
+ -- multi-dimensional input
+ local inputSize = 2
+ local input = torch.LongTensor(batchSize, inputSize):random(1, nClass)
+ local gradOutput = torch.randn(batchSize, inputSize, nClass)
+
+ local oh = nn.OneHot(nClass, 2)
+
+ local output = oh:forward(input)
+ local output2 = torch.Tensor(batchSize*inputSize, nClass):zero()
+ local eye = torch.eye(nClass)
+ output2:index(eye, 1, input:view(-1))
+ output2:resize(batchSize, inputSize, nClass)
+ mytester:assertTensorEq(output, output2, 0.000001, "OneHot 2d forward batch err")
+ mytester:assert(output:dim() == 3)
+
+ local gradInput = oh:backward(input, gradOutput)
+ mytester:assertTensorEq(gradInput, input:double():zero(), 0.000001, "OneHot 2d backward batch err")
+
+ if pcall(function() require 'cunn' end) then
+ oh:cuda()
+
+ -- test with long input
+ local output = oh:forward(input)
+ mytester:assert(torch.type(output) == 'torch.CudaTensor')
+ mytester:assertTensorEq(output:double(), output2, 0.000001, "OneHot 2d forward batch long-cuda err")
+
+ -- test with cuda input
+ local input = input:cuda()
+ gradOutput = gradOutput:cuda()
+
+ local output = oh:forward(input)
+ mytester:assert(torch.type(output) == 'torch.CudaTensor')
+ mytester:assertTensorEq(output:double(), output2, 0.000001, "OneHot 2d forward batch cuda err")
+
+ local gradInput2 = oh:backward(input, gradOutput)
+ mytester:assertTensorEq(gradInput, gradInput2:double(), 0.000001, "OneHot 2d backward batch err")
+
+ local benchmark = false
+ if benchmark then
+ local input = torch.FloatTensor(50, 50):random(1,65):cuda()
+
+ local oh = nn.OneHot(65):cuda()
+
+ oh:forward(input)
+ cutorch.synchronize()
+ local a = torch.Timer()
+ for i=1,10 do
+ oh:forward(input)
+ end
+ cutorch.synchronize()
+ local gputime = a:time().real
+
+ oh:float()
+ input = input:float()
+ oh:forward(input)
+ a = torch.Timer()
+ for i=1,10 do
+ oh:forward(input)
+ end
+ local cputime = a:time().real
+ print("Onehot GPU vs CPU time", gputime, cputime)
+ end
+ end
+end
+
+function nntest.ZeroGrad()
+ local input = torch.randn(3,4)
+ local zg = nn.ZeroGrad()
+ local output = zg:forward(input)
+ mytester:assertTensorEq(input, output, 0.00000001)
+ local gradInput = zg:backward(input, input)
+ local gradInput2 = gradInput:clone():zero()
+ mytester:assertTensorEq(gradInput, gradInput2, 0.0000001)
+end
+
+function nntest.ZipTable()
+ -- input : { {a1,a2}, {b1,b2}, {c1,c2} }
+ -- output : { {a1,b1,c1}, {a2,b2,c2} }
+ local z = nn.ZipTable()
+ local input = {
+ {torch.randn(3,4), torch.randn(3,4)},
+ {torch.randn(3,4), torch.randn(3,4)},
+ {torch.randn(3,4), torch.randn(3,4)}
+ }
+ local output = z:forward(input)
+ mytester:assert(#output == 2, "ZipTable #output")
+ mytester:assert(#(output[1]) == 3, "ZipTable #output[1]")
+ mytester:assertTensorEq(input[1][1], output[1][1], 0.000001, "ZipTable input11")
+ mytester:assertTensorEq(input[1][2], output[2][1], 0.000001, "ZipTable input12")
+ mytester:assertTensorEq(input[3][2], output[2][3], 0.000001, "ZipTable input32")
+ local gradInput = z:backward(input, output)
+ mytester:assert(#gradInput == 3, "ZipTable #gradInput")
+ mytester:assert(#(gradInput[1]) == 2, "ZipTable #gradInput[1]")
+ mytester:assertTensorEq(input[1][1], gradInput[1][1], 0.000001, "ZipTable gradInput11")
+ mytester:assertTensorEq(input[1][2], gradInput[1][2], 0.000001, "ZipTable gradInput12")
+ mytester:assertTensorEq(input[3][2], gradInput[3][2], 0.000001, "ZipTable gradInput32")
+end
+
+function nntest.ZipTableOneToMany()
+ -- input : { v, {a,b,c} }
+ -- output : { {v,a}, {v,b}, {v,c} }
+ local z = nn.ZipTableOneToMany()
+ local input = { torch.randn(3), { torch.randn(4), torch.rand(4), torch.rand(4) } }
+ local output = z:forward(input)
+ mytester:assert(#output == 3, "ZipTableOneToMany #output")
+ mytester:assert(#(output[1]) == 2, "ZipTableOneToMany #output[1]")
+ mytester:assert(#(output[2]) == 2, "ZipTableOneToMany #output[2]")
+ mytester:assert(#(output[3]) == 2, "ZipTableOneToMany #output[3]")
+ mytester:assertTensorEq(input[1], output[1][1], 0.000001, "ZipTableOneToMany input1 output11")
+ mytester:assertTensorEq(input[1], output[2][1], 0.000001, "ZipTableOneToMany input1 output21")
+ mytester:assertTensorEq(input[1], output[3][1], 0.000001, "ZipTableOneToMany input1 output31")
+ mytester:assertTensorEq(input[2][1], output[1][2], 0.000001, "ZipTableOneToMany input21")
+ mytester:assertTensorEq(input[2][2], output[2][2], 0.000001, "ZipTableOneToMany input22")
+ mytester:assertTensorEq(input[2][3], output[3][2], 0.000001, "ZipTableOneToMany input23")
+ local gradInput = z:backward(input, output)
+ mytester:assert(#gradInput == 2, "ZipTableOneToMany #gradInput")
+ mytester:assert(#(gradInput[2]) == 3, "ZipTableOneToMany #gradInput[2]")
+ mytester:assertTensorEq(input[2][1], gradInput[2][1], 0.000001, "ZipTableOneToMany gradInput21")
+ mytester:assertTensorEq(input[2][2], gradInput[2][2], 0.000001, "ZipTableOneToMany gradInput22")
+ mytester:assertTensorEq(input[2][3], gradInput[2][3], 0.000001, "ZipTableOneToMany gradInput23")
+ mytester:assertTensorEq(torch.mul(input[1], 3), gradInput[1], 0.000001, "ZipTableOneToMany gradInput1")
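+ -- the single tensor v receives the sum of the gradients from all three
+ -- pairs; with gradOutput = output each pair contributes v itself, hence 3*v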
+end
+
+function nntest.Collapse()
+ local c = nn.Collapse(3)
+ local input = torch.randn(8,3,4,5)
+ local output = c:forward(input)
+ mytester:assertTensorEq(input:view(8,-1), output, 0.000001, "Collapse:forward")
+ local gradInput = c:backward(input, output)
+ mytester:assertTensorEq(gradInput, input, 0.000001, "Collapse:backward")
+ mytester:assertTableEq(gradInput:size():totable(), input:size():totable(), 0.000001, "Collapse:backward size")
+ local input2 = input:transpose(1,4)
+ local output2 = c:forward(input2)
+ mytester:assertTensorEq(input2:contiguous():view(5,-1), output2, 0.000001, "Collapse:forward non-contiguous")
+ local gradInput2 = c:backward(input2, output2)
+ mytester:assertTensorEq(gradInput2, input2, 0.000001, "Collapse:backward non-contiguous")
+ mytester:assertTableEq(gradInput2:size():totable(), input2:size():totable(), 0.000001, "Collapse:backward size non-contiguous")
+end
+
+function nntest.Convert()
+ -- batch mode
+ local c = nn.Convert('bchw', 'chwb')
+ local input = torch.randn(8,3,5,5)
+ local output = c:forward(input)
+ local output2 = input:transpose(1,4):transpose(1,3):transpose(1,2)
+ mytester:assertTensorEq(output, output2, 0.000001, "Convert fwd bchw->chwb")
+ local gradInput = c:backward(input, output)
+ mytester:assertTensorEq(gradInput, input, 0.000001, "Convert bwd bchw->chwb")
+ local c = nn.Convert('bchw', 'bf')
+ local output = c:forward(input)
+ local output2 = input:view(8,-1)
+ mytester:assertTensorEq(output, output2, 0.000001, "Convert fwd bchw->bf")
+ c:float()
+ local output = c:forward(input:float())
+ mytester:assertTensorEq(output, output2:float(), 0.000001, "Convert:type()")
+ local output = c:forward(input)
+ mytester:assertTensorEq(output, output2:float(), 0.000001, "Convert:type() double->float")
+ -- non-batch mode
+ local c = nn.Convert('chw', 'hwc')
+ local input = torch.randn(3,5,5)
+ local output = c:forward(input)
+ local output2 = input:transpose(1,3):transpose(1,2)
+ mytester:assertTensorEq(output, output2, 0.000001, "Convert fwd chw->hwc non-batch")
+ local gradInput = c:backward(input, output)
+ mytester:assertTensorEq(gradInput, input, 0.000001, "Convert bwd chw->hwc non-batch")
+ local c = nn.Convert('chw', 'f')
+ local output = c:forward(input)
+ local output2 = input:view(-1)
+ mytester:assertTensorEq(output, output2, 0.000001, "Convert fwd chw->bf non-batch")
+ c:float()
+ local output = c:forward(input:float())
+ mytester:assertTensorEq(output, output2:float(), 0.000001, "Convert:type() non-batch")
+ local output = c:forward(input)
+ mytester:assertTensorEq(output, output2:float(), 0.000001, "Convert:type() double->float non-batch")
+end
+
+function nntest.CAddTensorTable()
+ -- input : { v, {a,b,c} }
+ -- output : { v+a, v+b, v+c }
+ local z = nn.CAddTensorTable()
+ local input = { torch.randn(3), { torch.randn(3), torch.rand(3), torch.rand(3) } }
+ local output = z:forward(input)
+ mytester:assert(#output == 3, "CAddTensorTable #output")
+ mytester:assertTensorEq(input[1]+input[2][1], output[1], 0.00001, "CAddTensorTable input21 output1")
+ mytester:assertTensorEq(input[1]+input[2][2], output[2], 0.00001, "CAddTensorTable input22 output2")
+ mytester:assertTensorEq(input[1]+input[2][3], output[3], 0.00001, "CAddTensorTable input23 output3")
+ local gradInput = z:backward(input, output)
+ mytester:assert(#gradInput == 2, "CAddTensorTable #gradInput")
+ mytester:assert(#(gradInput[2]) == 3, "CAddTensorTable #gradInput[2]")
+ mytester:assertTensorEq(output[1], gradInput[2][1], 0.000001, "CAddTensorTable gradInput21")
+ mytester:assertTensorEq(output[2], gradInput[2][2], 0.000001, "CAddTensorTable gradInput22")
+ mytester:assertTensorEq(output[3], gradInput[2][3], 0.000001, "CAddTensorTable gradInput23")
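+   -- input[1] is added to every output, so its gradient is the sum of all gradOutputs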
+ mytester:assertTensorEq(output[1]+output[2]+output[3], gradInput[1], 0.000001, "CAddTensorTable gradInput1")
+end
+
+-- Unit Test Kmeans layer
+function nntest.Kmeans()
+ local k = 3
+ local dim = 5
+ local batchSize = 200
+ local input = torch.Tensor(batchSize, dim)
+ for i=1, batchSize do
+ input[i]:fill(torch.random(1, k))
+ end
+
+ local verbose = false
+
+ local attempts = 10
+ local iter = 100
+   local bestLoss = math.huge
+ local tempLoss = 0
+ local learningRate = 1
+
+ local initTypes = {'random', 'kmeans++'}
+ local useCudas = {false}
+ if pcall(function() require 'cunn' end) then
+ useCudas[2] = true
+ end
+ for _, initType in pairs(initTypes) do
+ for _, useCuda in pairs(useCudas) do
+
+ if useCuda then
+ input = input:cuda()
+ else
+ input = input:double()
+ end
+
+      local timer = torch.Timer()
+      bestLoss = math.huge -- reset per configuration so one run's result cannot mask another's
+ for j=1, attempts do
+ local km = nn.Kmeans(k, dim)
+ if useCuda then km:cuda() end
+
+ if initType == 'kmeans++' then
+ km:initKmeansPlus(input)
+ else
+ km:initRandom(input)
+ end
+
+ for i=1, iter do
+ km:zeroGradParameters()
+
+ km:forward(input)
+            km:backward(input) -- gradOutput is unused: Kmeans derives gradients from its own loss
+
+ -- Gradient descent
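+            -- weight <- weight - learningRate * gradWeight (add(alpha, tensor) is an in-place axpy)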
+ km.weight:add(-learningRate, km.gradWeight)
+ tempLoss = km.loss
+ end
+ if verbose then print("Attempt Loss " .. j ..": " .. tempLoss) end
+ if tempLoss < bestLoss then
+ bestLoss = tempLoss
+ end
+ if (initType == 'kmeans++' and bestLoss < 0.00001) or (initType == 'random' and bestLoss < 500) then
+ break
+ end
+ end
+ if verbose then
+ print("InitType: " .. initType .. " useCuda: " .. tostring(useCuda))
+ print("Best Loss: " .. bestLoss)
+ print("Total time: " .. timer:time().real)
+ end
+ if initType == 'kmeans++' then
+ mytester:assert(bestLoss < 0.00001, "Kmeans++ error ("..(useCuda and 'cuda' or 'double')..")")
+ else
+ mytester:assert(bestLoss < 500, "Kmeans error ("..(useCuda and 'cuda' or 'double')..")")
+ end
+ end
+ end
+end
+
+mytester:add(nntest)
+
+jac = nn.Jacobian
+sjac = nn.SparseJacobian
+function nn.test(tests, seed)
+ -- Limit number of threads since everything is small
+ local nThreads = torch.getnumthreads()
+ torch.setnumthreads(1)
+ -- randomize stuff
+   seed = seed or (1e5 * torch.tic())
+ print('Seed: ', seed)
+ math.randomseed(seed)
+ torch.manualSeed(seed)
+ mytester:run(tests)
+ torch.setnumthreads(nThreads)
+ return mytester
+end
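+
+-- Usage sketch (test names are the nntest.* keys defined above):
+-- > nn.test()                    -- run the full suite with a time-based seed
+-- > nn.test({'ZipTable'}, 1234)  -- run a subset with a fixed seed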
+
+function nn.testTHNN(tests, seed)
+ require 'test.LinearTHNN'
+ nn.Linear = nn.LinearTHNN
+ return nn.test(tests,seed)
+end
diff --git a/contrib/lua-torch/nn/utils.lua b/contrib/lua-torch/nn/utils.lua
new file mode 100644
index 000000000..17b52afb3
--- /dev/null
+++ b/contrib/lua-torch/nn/utils.lua
@@ -0,0 +1,223 @@
+nn.utils = {}
+
+-- oops; someone forgot to add torch.Storage.type
+-- TODO replace with torch.Storage.type when implemented
+local function torch_Storage_type(self, type)
+ local current = torch.typename(self)
+ if not type then return current end
+ if type ~= current then
+ local new = torch.getmetatable(type).new()
+ if self:size() > 0 then
+ new:resize(self:size()):copy(self)
+ end
+ return new
+ else
+ return self
+ end
+end
+
+-- tensorCache maintains a list of all tensors and storages that have been
+-- converted (recursively) by calls to recursiveType() and type().
+-- It caches conversions in order to preserve sharing semantics
+-- i.e. if two tensors share a common storage, then type conversion
+-- should preserve that.
+--
+-- You can preserve sharing semantics across multiple networks by
+-- passing tensorCache between the calls to type, e.g.
+--
+-- > tensorCache = {}
+-- > net1:type('torch.CudaTensor', tensorCache)
+-- > net2:type('torch.CudaTensor', tensorCache)
+-- > nn.utils.recursiveType(anotherTensor, 'torch.CudaTensor', tensorCache)
+--
+-- Implementation note: to make Lua table lookup behave correctly,
+-- tensor keys are stored as actual tensor objects, while storage
+-- keys are stored as the pointers themselves (as numbers).
+function nn.utils.recursiveType(param, type, tensorCache)
+ tensorCache = tensorCache or {}
+
+ if torch.type(param) == 'table' then
+ for k, v in pairs(param) do
+ param[k] = nn.utils.recursiveType(v, type, tensorCache)
+ end
+ elseif torch.isTypeOf(param, 'nn.Module') or
+ torch.isTypeOf(param, 'nn.Criterion') then
+ param:type(type, tensorCache)
+ elseif torch.isTensor(param) then
+ if torch.typename(param) ~= type then
+ local newparam
+ if tensorCache[param] then
+ newparam = tensorCache[param]
+ else
+ newparam = torch.Tensor():type(type)
+ local storageType = type:gsub('Tensor','Storage')
+ if param:storage() then
+ local storage_key = torch.pointer(param:storage())
+ if not tensorCache[storage_key] then
+ tensorCache[storage_key] = torch_Storage_type(
+ param:storage(), storageType)
+ end
+ assert(torch.type(tensorCache[storage_key]) == storageType)
+ newparam:set(
+ tensorCache[storage_key],
+ param:storageOffset(),
+ param:size(),
+ param:stride()
+ )
+ end
+ tensorCache[param] = newparam
+ end
+ assert(torch.type(newparam) == type)
+ param = newparam
+ end
+ end
+ return param
+end
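+
+-- Example (a sketch): tensors that share a storage keep sharing after
+-- conversion when the same tensorCache is reused across calls:
+-- > local cache = {}
+-- > local a = torch.DoubleTensor(10):fill(1)
+-- > local b = a:narrow(1, 1, 5)
+-- > a = nn.utils.recursiveType(a, 'torch.FloatTensor', cache)
+-- > b = nn.utils.recursiveType(b, 'torch.FloatTensor', cache)
+-- > a:fill(0) -- b reads zeros too; the shared storage was converted only once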
+
+function nn.utils.recursiveResizeAs(t1,t2)
+ if torch.type(t2) == 'table' then
+ t1 = (torch.type(t1) == 'table') and t1 or {t1}
+ for key,_ in pairs(t2) do
+ t1[key], t2[key] = nn.utils.recursiveResizeAs(t1[key], t2[key])
+ end
+ for key,_ in pairs(t1) do
+ if not t2[key] then
+ t1[key] = nil
+ end
+ end
+ elseif torch.isTensor(t2) then
+ t1 = torch.isTensor(t1) and t1 or t2.new()
+ t1:resize(t2:size())
+ else
+ error("expecting nested tensors or tables. Got "..
+ torch.type(t1).." and "..torch.type(t2).." instead")
+ end
+ return t1, t2
+end
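+
+-- Typical use (a sketch): size a gradInput buffer to match a possibly nested output:
+-- > self.gradInput = nn.utils.recursiveResizeAs(self.gradInput, self.output)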
+
+function nn.utils.recursiveFill(t2, val)
+ if torch.type(t2) == 'table' then
+ for key,_ in pairs(t2) do
+ t2[key] = nn.utils.recursiveFill(t2[key], val)
+ end
+ elseif torch.isTensor(t2) then
+ t2:fill(val)
+ else
+ error("expecting tensor or table thereof. Got "
+ ..torch.type(t2).." instead")
+ end
+ return t2
+end
+
+function nn.utils.recursiveAdd(t1, val, t2)
+ if not t2 then
+ assert(val, "expecting at least two arguments")
+ t2 = val
+ val = 1
+ end
+ val = val or 1
+ if torch.type(t2) == 'table' then
+ t1 = (torch.type(t1) == 'table') and t1 or {t1}
+ for key,_ in pairs(t2) do
+ t1[key], t2[key] = nn.utils.recursiveAdd(t1[key], val, t2[key])
+ end
+ elseif torch.isTensor(t1) and torch.isTensor(t2) then
+ t1:add(val, t2)
+ else
+ error("expecting nested tensors or tables. Got "..
+ torch.type(t1).." and "..torch.type(t2).." instead")
+ end
+ return t1, t2
+end
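+
+-- Typical use (a sketch): accumulate scaled gradients across nested tables:
+-- > self.gradInput = nn.utils.recursiveAdd(self.gradInput, scale, gradOutput)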
+
+function nn.utils.recursiveCopy(t1,t2,async)
+ if torch.type(t2) == 'table' then
+ t1 = (torch.type(t1) == 'table') and t1 or {t1}
+ for key,_ in pairs(t2) do
+ t1[key], t2[key] = nn.utils.recursiveCopy(t1[key], t2[key], async)
+ end
+ elseif torch.isTensor(t2) then
+ t1 = torch.isTensor(t1) and t1 or t2.new()
+ t1:resize(t2:size())
+ if async then
+ t1:copyAsync(t2)
+ else
+ t1:copy(t2)
+ end
+ else
+ error("expecting nested tensors or tables. Got "..
+ torch.type(t1).." and "..torch.type(t2).." instead")
+ end
+ return t1, t2
+end
+
+function nn.utils.addSingletonDimension(...)
+ local view, t, dim
+ if select('#',...) < 3 then
+ t, dim = select(1,...)
+ else
+ view, t, dim = select(1,...)
+ assert(torch.isTensor(view),
+ "output tensor expected, got " .. type(view))
+ end
+
+ assert(torch.isTensor(t), "input tensor expected")
+ dim = dim or 1
+ assert(dim > 0 and dim <= (t:dim() + 1), "invalid dimension: " .. dim
+ .. '. Tensor is of ' .. t:dim() .. ' dimensions.')
+
+ view = view or t.new()
+ local size = torch.LongStorage(t:dim() + 1)
+ local stride = torch.LongStorage(t:dim() + 1)
+
+ for d = 1, dim - 1 do
+ size[d] = t:size(d)
+ stride[d] = t:stride(d)
+ end
+ size[dim] = 1
+ stride[dim] = 1
+ for d = dim + 1, t:dim() + 1 do
+ size[d] = t:size(d - 1)
+ stride[d] = t:stride(d - 1)
+ end
+
+ view:set(t:storage(), t:storageOffset(), size, stride)
+ return view
+end
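+
+-- Example (a sketch): view a 3x4 tensor as 3x1x4 without copying:
+-- > local t = torch.randn(3, 4)
+-- > local v = nn.utils.addSingletonDimension(t, 2) -- v shares t's storage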
+
+function nn.utils.contiguousView(output, input, ...)
+ output = output or input.new()
+ if input:isContiguous() then
+ output:view(input, ...)
+ else
+ output:resize(input:size())
+ output:copy(input)
+ output:view(output, ...)
+ end
+ return output
+end
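+
+-- Example (a sketch): flatten each sample to a row, copying only when the
+-- input is not contiguous:
+-- > output = nn.utils.contiguousView(output, input, input:size(1), -1)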
+
+-- go over specified fields and clear them. accepts
+-- nn.utils.clearState(self, {'_buffer', '_buffer2'}) and
+-- nn.utils.clearState(self, '_buffer', '_buffer2')
+function nn.utils.clear(self, ...)
+ local arg = {...}
+ if #arg > 0 and type(arg[1]) == 'table' then
+ arg = arg[1]
+ end
+ local function clear(f)
+ if self[f] then
+ if torch.isTensor(self[f]) then
+ self[f]:set()
+ elseif type(self[f]) == 'table' then
+ self[f] = {}
+ else
+ self[f] = nil
+ end
+ end
+ end
+ for i,v in ipairs(arg) do clear(v) end
+ return self
+end
+
+-- Lua 5.1 compatibility: unpack only became table.unpack in Lua 5.2
+table.unpack = table.unpack or unpack