diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2019-07-01 15:13:04 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2019-07-01 15:13:04 +0100 |
commit | 891b250b452f8e1963a99931f241ac75e34d0281 (patch) | |
tree | ab56b822aca3cc6d02a3c9afbe8ca2f6d1c0381f /contrib/lua-torch/torch7/lib | |
parent | 38691d998d019ac0fba95720c337e3f9badf55c4 (diff) | |
download | rspamd-891b250b452f8e1963a99931f241ac75e34d0281.tar.gz rspamd-891b250b452f8e1963a99931f241ac75e34d0281.zip |
[Project] Remove torch
Diffstat (limited to 'contrib/lua-torch/torch7/lib')
97 files changed, 0 insertions, 22432 deletions
diff --git a/contrib/lua-torch/torch7/lib/CMakeLists.txt b/contrib/lua-torch/torch7/lib/CMakeLists.txt deleted file mode 100644 index d6a0e2c9c..000000000 --- a/contrib/lua-torch/torch7/lib/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -SET(TH_INSTALL_BIN_SUBDIR "${BINDIR}") -SET(TH_INSTALL_LIB_SUBDIR "${RSPAMD_LIBDIR}") -SET(TH_INSTALL_INCLUDE_SUBDIR "${Torch_INSTALL_INCLUDE_SUBDIR}") -SET(TH_INSTALL_CMAKE_SUBDIR "${Torch_INSTALL_CMAKE_SUBDIR}") - -ADD_SUBDIRECTORY(TH) -ADD_SUBDIRECTORY(luaT) diff --git a/contrib/lua-torch/torch7/lib/TH/CMakeLists.txt b/contrib/lua-torch/torch7/lib/TH/CMakeLists.txt deleted file mode 100644 index f7e0bf9bb..000000000 --- a/contrib/lua-torch/torch7/lib/TH/CMakeLists.txt +++ /dev/null @@ -1,296 +0,0 @@ -cmake_minimum_required(VERSION 2.6) - -# avoid some cmake warnings - -LIST(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") -SET(CMAKE_LIBRARY_PATH /usr/lib/x86_64-linux-gnu/ ${CMAKE_LIBRARY_PATH}) - -####################################################################### -##### flags section -###################################################################### - -IF(MSVC) - # MSVC now supports C99 since VS2013/VS2015, however the standard version switch is not provided yet - # SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /std:c99") -ELSE(MSVC) - # enable gnu99 and not c99 because we use - # gnu extensions like posix_memalign - SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=gnu99") -ENDIF(MSVC) - -IF(MSVC) - ADD_DEFINITIONS(-D_CRT_SECURE_NO_DEPRECATE=1) # respect the standard -ENDIF(MSVC) -SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -w") -IF(UNIX) - # prevent Unknown CMake command "check_function_exists". - INCLUDE(CheckFunctionExists) -ENDIF(UNIX) - -# OpenMP support? 
- -IF (WITH_OPENMP) - FIND_PACKAGE(OpenMP) - IF(OPENMP_FOUND) - MESSAGE(STATUS "Compiling with OpenMP support") - SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") - SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") - ENDIF(OPENMP_FOUND) -ENDIF (WITH_OPENMP) - -# ARM specific flags -FIND_PACKAGE(ARM) -IF (ASIMD_FOUND) - MESSAGE(STATUS "asimd/Neon found with compiler flag : -D__NEON__") - SET(CMAKE_C_FLAGS "-D__NEON__ ${CMAKE_C_FLAGS}") -ELSEIF (NEON_FOUND) - MESSAGE(STATUS "Neon found with compiler flag : -mfpu=neon -D__NEON__") - SET(CMAKE_C_FLAGS "-mfpu=neon -D__NEON__ ${CMAKE_C_FLAGS}") -ENDIF (ASIMD_FOUND) -IF (CORTEXA8_FOUND) - MESSAGE(STATUS "Cortex-A8 Found with compiler flag : -mcpu=cortex-a8") - SET(CMAKE_C_FLAGS "-mcpu=cortex-a8 -fprefetch-loop-arrays ${CMAKE_C_FLAGS}") -ENDIF (CORTEXA8_FOUND) -IF (CORTEXA9_FOUND) - MESSAGE(STATUS "Cortex-A9 Found with compiler flag : -mcpu=cortex-a9") - SET(CMAKE_C_FLAGS "-mcpu=cortex-a9 ${CMAKE_C_FLAGS}") -ENDIF (CORTEXA9_FOUND) - -INCLUDE (CheckIncludeFile) -INCLUDE (CheckCSourceCompiles) -CHECK_INCLUDE_FILE(cpuid.h HAVE_CPUID_H) -# Check for a cpuid intrinsic -IF(HAVE_CPUID_H) - CHECK_C_SOURCE_COMPILES("#include <cpuid.h> - int main() - { - unsigned int eax, ebx, ecx, edx; - return __get_cpuid(0, &eax, &ebx, &ecx, &edx); - }" HAVE_GCC_GET_CPUID) -ENDIF() -IF(HAVE_GCC_GET_CPUID) - SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DHAVE_GCC_GET_CPUID") -ENDIF(HAVE_GCC_GET_CPUID) - -CHECK_C_SOURCE_COMPILES("#include <stdint.h> - static inline void cpuid(uint32_t *eax, uint32_t *ebx, - uint32_t *ecx, uint32_t *edx) - { - uint32_t a = *eax, b, c = *ecx, d; - asm volatile ( \"cpuid\" : \"+a\"(a), \"=b\"(b), \"+c\"(c), \"=d\"(d) ); - *eax = a; *ebx = b; *ecx = c; *edx = d; - } - int main() { - uint32_t a,b,c,d; - cpuid(&a, &b, &c, &d); - return 0; - }" NO_GCC_EBX_FPIC_BUG) - -IF(NOT NO_GCC_EBX_FPIC_BUG) - SET(CMAKE_C_FLAGS 
"${CMAKE_C_FLAGS} -DUSE_GCC_GET_CPUID") -ENDIF(NOT NO_GCC_EBX_FPIC_BUG) - - -FIND_PACKAGE(SSE) # checks SSE, AVX and AVX2 -IF(C_SSE2_FOUND) - MESSAGE(STATUS "SSE2 Found") - SET(CMAKE_C_FLAGS "${C_SSE2_FLAGS} -DUSE_SSE2 ${CMAKE_C_FLAGS}") -ENDIF(C_SSE2_FOUND) -IF(C_SSE3_FOUND) - MESSAGE(STATUS "SSE3 Found") - SET(CMAKE_C_FLAGS "${C_SSE3_FLAGS} -DUSE_SSE3 ${CMAKE_C_FLAGS}") -ENDIF(C_SSE3_FOUND) -# we dont set -mavx and -mavx2 flags globally, but only for specific files -# however, we want to enable the AVX codepaths, so we still need to -# add USE_AVX and USE_AVX2 macro defines -IF(FALSE) -IF(C_AVX_FOUND) - MESSAGE(STATUS "AVX Found") - SET(CMAKE_C_FLAGS "-DUSE_AVX ${CMAKE_C_FLAGS}") -ENDIF(C_AVX_FOUND) -IF(C_AVX2_FOUND) - MESSAGE(STATUS "AVX2 Found") - SET(CMAKE_C_FLAGS "-DUSE_AVX2 ${CMAKE_C_FLAGS}") -ENDIF(C_AVX2_FOUND) -ENDIF() - -CHECK_C_SOURCE_RUNS(" -#include <stdatomic.h> -int main() -{ - int a; - int oa; - atomic_store(&a, 1); - atomic_fetch_add(&a, 1); - oa = atomic_load(&a); - if(!atomic_compare_exchange_strong(&a, &oa, 3)) - return -1; - return 0; -} -" HAS_C11_ATOMICS) - -IF(NOT HAS_C11_ATOMICS) - CHECK_C_SOURCE_RUNS(" -#include <intrin.h> -int main() -{ - long a; - _InterlockedExchange(&a, 1); - _InterlockedExchangeAdd(&a, 1); - if(_InterlockedCompareExchange(&a, 3, 2) != 2) - return -1; - return 0; -} -" HAS_MSC_ATOMICS) - - CHECK_C_SOURCE_RUNS(" -int main() -{ - int a; - __sync_lock_test_and_set(&a, 1); - __sync_fetch_and_add(&a, 1); - if(!__sync_bool_compare_and_swap(&a, 2, 3)) - return -1; - return 0; -} -" HAS_GCC_ATOMICS) -ENDIF() - -####################################################################### -##### sources section -###################################################################### - -# IF ANY SIMD FOUND -IF ("${ARCH}" STREQUAL "x86_64") - SET(simd generic/simd/convolve.c generic/simd/convolve5x5_sse.c) - SET(CMAKE_C_FLAGS "-DUSE_SSE2 ${CMAKE_C_FLAGS}") - SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve5x5_sse.c PROPERTIES 
COMPILE_FLAGS "-O3 -ffast-math") -ENDIF() - - -# IF AVX FOUND -IF(FALSE) -IF(C_AVX_FOUND) - IF(MSVC) - SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve5x5_avx.c PROPERTIES COMPILE_FLAGS "/Ox /fp:fast ${C_AVX_FLAGS}") - SET_SOURCE_FILES_PROPERTIES(vector/AVX.c PROPERTIES COMPILE_FLAGS "/Ox /arch:AVX ${C_AVX_FLAGS}") - ELSE(MSVC) - SET_SOURCE_FILES_PROPERTIES(generic/simd/convolve5x5_avx.c PROPERTIES COMPILE_FLAGS "-O3 -ffast-math ${C_AVX_FLAGS}") - SET_SOURCE_FILES_PROPERTIES(vector/AVX.c PROPERTIES COMPILE_FLAGS "-O3 ${C_AVX_FLAGS}") - ENDIF(MSVC) - SET(simd ${simd} vector/AVX.c generic/simd/convolve5x5_avx.c) -ENDIF(C_AVX_FOUND) - -IF(C_AVX2_FOUND) - IF(MSVC) - SET_SOURCE_FILES_PROPERTIES(vector/AVX2.c PROPERTIES COMPILE_FLAGS "/Ox /arch:AVX2 ${C_AVX2_FLAGS}") - ELSE(MSVC) - SET_SOURCE_FILES_PROPERTIES(vector/AVX2.c PROPERTIES COMPILE_FLAGS "-O3 ${C_AVX2_FLAGS}") - ENDIF(MSVC) - SET(simd ${simd} vector/AVX2.c) -ENDIF(C_AVX2_FOUND) -ENDIF() - -SET(hdr - THGeneral.h THHalf.h THAllocator.h THSize.h THStorage.h THTensor.h THTensorApply.h THBlas.h THMath.h - THLapack.h THLogAdd.h THRandom.h THVector.h THAtomic.h ) - -SET(src - THGeneral.c THHalf.c THAllocator.c THSize.c THStorage.c THTensor.c THBlas.c THLapack.c - THLogAdd.c THRandom.c THFile.c THDiskFile.c THMemoryFile.c THAtomic.c THVector.c) - -SET(src ${src} ${hdr} ${simd}) - -####################################################################### -##### build section -###################################################################### - -ADD_TORCH_LIBRARY(TH SHARED "${src}") - -IF(HAS_C11_ATOMICS) - ADD_DEFINITIONS(-DUSE_C11_ATOMICS=1) - MESSAGE(STATUS "Atomics: using C11 intrinsics") -ELSEIF(HAS_MSC_ATOMICS) - ADD_DEFINITIONS(-DUSE_MSC_ATOMICS=1) - MESSAGE(STATUS "Atomics: using MSVC intrinsics") -ELSEIF(HAS_GCC_ATOMICS) - ADD_DEFINITIONS(-DUSE_GCC_ATOMICS=1) - MESSAGE(STATUS "Atomics: using GCC intrinsics") -ELSE() - SET(CMAKE_THREAD_PREFER_PTHREAD TRUE) - FIND_PACKAGE(Threads) - IF(THREADS_FOUND) - 
ADD_DEFINITIONS(-DUSE_PTHREAD_ATOMICS=1) - TARGET_LINK_LIBRARIES(TH ${CMAKE_THREAD_LIBS_INIT}) - MESSAGE(STATUS "Atomics: using pthread") - ENDIF() -ENDIF() - -FIND_PACKAGE(BLAS) -IF(BLAS_FOUND) - SET(USE_BLAS 1) - TARGET_LINK_LIBRARIES(TH ${BLAS_LIBRARIES}) - IF(BLAS_INFO STREQUAL "mkl") - ADD_DEFINITIONS(-DTH_BLAS_MKL) - ELSEIF(BLAS_INFO STREQUAL "open") - ADD_DEFINITIONS(-DTH_BLAS_OPEN) - ENDIF() -ENDIF(BLAS_FOUND) - -FIND_PACKAGE(LAPACK) -IF(LAPACK_FOUND) - SET(USE_LAPACK 1) - TARGET_LINK_LIBRARIES(TH ${LAPACK_LIBRARIES}) -ENDIF(LAPACK_FOUND) - -IF (UNIX AND NOT APPLE) - INCLUDE(CheckLibraryExists) - # https://github.com/libgit2/libgit2/issues/2128#issuecomment-35649830 - CHECK_LIBRARY_EXISTS(rt clock_gettime "time.h" NEED_LIBRT) - IF(NEED_LIBRT) - TARGET_LINK_LIBRARIES(TH rt) - SET(CMAKE_REQUIRED_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES} rt) - ENDIF(NEED_LIBRT) -ENDIF(UNIX AND NOT APPLE) - -IF(UNIX) - SET(CMAKE_EXTRA_INCLUDE_FILES "sys/mman.h") - CHECK_FUNCTION_EXISTS(mmap HAVE_MMAP) - IF(HAVE_MMAP) - ADD_DEFINITIONS(-DHAVE_MMAP=1) - ENDIF(HAVE_MMAP) - # done for lseek: https://www.gnu.org/software/libc/manual/html_node/File-Position-Primitive.html - ADD_DEFINITIONS(-D_FILE_OFFSET_BITS=64) - CHECK_FUNCTION_EXISTS(shm_open HAVE_SHM_OPEN) - IF(HAVE_SHM_OPEN) - ADD_DEFINITIONS(-DHAVE_SHM_OPEN=1) - ENDIF(HAVE_SHM_OPEN) - CHECK_FUNCTION_EXISTS(shm_unlink HAVE_SHM_UNLINK) - IF(HAVE_SHM_UNLINK) - ADD_DEFINITIONS(-DHAVE_SHM_UNLINK=1) - ENDIF(HAVE_SHM_UNLINK) - CHECK_FUNCTION_EXISTS(malloc_usable_size HAVE_MALLOC_USABLE_SIZE) - IF(HAVE_MALLOC_USABLE_SIZE) - ADD_DEFINITIONS(-DHAVE_MALLOC_USABLE_SIZE=1) - ENDIF(HAVE_MALLOC_USABLE_SIZE) -ENDIF(UNIX) - -IF(NOT MSVC) - TARGET_LINK_LIBRARIES(TH m) -ENDIF(NOT MSVC) - -# Is __thread supported? 
-IF(NOT MSVC) - CHECK_C_SOURCE_COMPILES("static __thread int x = 1; int main() { return x; }" C_HAS_THREAD) -ELSE(NOT MSVC) - CHECK_C_SOURCE_COMPILES("static __declspec( thread ) int x = 1; int main() { return x; }" C_HAS_THREAD) -ENDIF(NOT MSVC) -IF(NOT C_HAS_THREAD) - MESSAGE(STATUS "Warning: __thread is not supported, generating thread-unsafe code") -ELSE(NOT C_HAS_THREAD) - SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DTH_HAVE_THREAD") -ENDIF(NOT C_HAS_THREAD) - -INCLUDE_DIRECTORIES("${CMAKE_CURRENT_BINARY_DIR}") -CONFIGURE_FILE(THGeneral.h.in "${CMAKE_CURRENT_BINARY_DIR}/THGeneral.h") diff --git a/contrib/lua-torch/torch7/lib/TH/README.md b/contrib/lua-torch/torch7/lib/TH/README.md deleted file mode 100644 index 4ac26c103..000000000 --- a/contrib/lua-torch/torch7/lib/TH/README.md +++ /dev/null @@ -1,11 +0,0 @@ -Environment variables control the disabling of certain explicit SIMD optimizations. - -``` -x64 options: -TH_NO_AVX2=1 # disable AVX2 codepaths -TH_NO_AVX=1 # disable AVX codepaths -TH_NO_SSE=1 # disable SSE codepaths - -ppc64le options: -TH_NO_VSX=1 # disable VSX codepaths -``` diff --git a/contrib/lua-torch/torch7/lib/TH/TH.h b/contrib/lua-torch/torch7/lib/TH/TH.h deleted file mode 100644 index 11f208c4b..000000000 --- a/contrib/lua-torch/torch7/lib/TH/TH.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef TH_INC -#define TH_INC - -#include "THGeneral.h" - -#include "THBlas.h" -#ifdef USE_LAPACK -#include "THLapack.h" -#endif - -#include "THAtomic.h" -#include "THVector.h" -#include "THLogAdd.h" -#include "THRandom.h" -#include "THSize.h" -#include "THStorage.h" -#include "THTensor.h" -#include "THTensorApply.h" -#include "THTensorDimApply.h" - -#include "THFile.h" -#include "THDiskFile.h" -#include "THMemoryFile.h" - -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/THAllocator.c b/contrib/lua-torch/torch7/lib/TH/THAllocator.c deleted file mode 100644 index 51ac69b94..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THAllocator.c +++ /dev/null @@ -1,500 +0,0 @@ 
-#include "THAllocator.h" -#include "THAtomic.h" - -/* stuff for mapped files */ -#ifdef _WIN32 -#include <windows.h> -#endif - -#if HAVE_MMAP -#include <sys/types.h> -#include <sys/mman.h> -#include <sys/stat.h> -#include <fcntl.h> -#include <unistd.h> -#endif -/* end of stuff for mapped files */ - -static void *THDefaultAllocator_alloc(void* ctx, ptrdiff_t size) { - return THAlloc(size); -} - -static void *THDefaultAllocator_realloc(void* ctx, void* ptr, ptrdiff_t size) { - return THRealloc(ptr, size); -} - -static void THDefaultAllocator_free(void* ctx, void* ptr) { - THFree(ptr); -} - -THAllocator THDefaultAllocator = { - &THDefaultAllocator_alloc, - &THDefaultAllocator_realloc, - &THDefaultAllocator_free -}; - -#if defined(_WIN32) || defined(HAVE_MMAP) - -struct THMapAllocatorContext_ { - char *filename; /* file name */ - int flags; - ptrdiff_t size; /* mapped size */ - int fd; -}; - -#define TH_ALLOC_ALIGNMENT 64 - -typedef struct { - int refcount; -} THMapInfo; - -char * unknown_filename = "filename not specified"; - -THMapAllocatorContext *THMapAllocatorContext_new(const char *filename, int flags) -{ - THMapAllocatorContext *ctx = THAlloc(sizeof(THMapAllocatorContext)); - - if (!(flags & TH_ALLOCATOR_MAPPED_SHARED) && !(flags & TH_ALLOCATOR_MAPPED_SHAREDMEM)) - flags &= ~TH_ALLOCATOR_MAPPED_NOCREATE; - if ((flags ^ TH_ALLOCATOR_MAPPED_EXCLUSIVE) == 0) - THError("TH_ALLOCATOR_MAPPED_EXCLUSIVE flag requires opening the file " - "in shared mode"); - - if (filename) { - ctx->filename = THAlloc(strlen(filename)+1); - strcpy(ctx->filename, filename); - } else { - ctx->filename = unknown_filename; - } - ctx->flags = flags; - ctx->size = 0; - ctx->fd = -1; - - return ctx; -} - -THMapAllocatorContext *THMapAllocatorContext_newWithFd(const char *filename, int fd, int flags) -{ - THMapAllocatorContext *ctx = THMapAllocatorContext_new(filename, flags); - ctx->fd = fd; - - return ctx; -} - -char * THMapAllocatorContext_filename(THMapAllocatorContext *ctx) -{ - return 
ctx->filename; -} - -int THMapAllocatorContext_fd(THMapAllocatorContext *ctx) -{ - return ctx->fd; -} - -ptrdiff_t THMapAllocatorContext_size(THMapAllocatorContext *ctx) -{ - return ctx->size; -} - -void THMapAllocatorContext_free(THMapAllocatorContext *ctx) -{ - if (ctx->filename != unknown_filename) - THFree(ctx->filename); - THFree(ctx); -} - -static void *_map_alloc(void* ctx_, ptrdiff_t size) -{ - THMapAllocatorContext *ctx = ctx_; - void *data = NULL; - -#ifdef _WIN32 - { - HANDLE hfile; - HANDLE hmfile; - LARGE_INTEGER hfilesz; - - if (ctx->flags & TH_ALLOCATOR_MAPPED_EXCLUSIVE) - THError("exclusive file mapping is not supported on Windows"); - if (ctx->flags & TH_ALLOCATOR_MAPPED_NOCREATE) - THError("file mapping without creation is not supported on Windows"); - if (ctx->flags & TH_ALLOCATOR_MAPPED_KEEPFD) - THError("TH_ALLOCATOR_MAPPED_KEEPFD not supported on Windows"); - if (ctx->flags & TH_ALLOCATOR_MAPPED_FROMFD) - THError("TH_ALLOCATOR_MAPPED_FROMFD not supported on Windows"); - - /* open file */ - /* FILE_FLAG_RANDOM_ACCESS ? 
*/ - if(ctx->flags) - { - hfile = CreateFileA(ctx->filename, GENERIC_READ|GENERIC_WRITE, FILE_SHARE_WRITE|FILE_SHARE_READ, 0, OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, 0); - if (hfile == INVALID_HANDLE_VALUE) - THError("could not open file <%s> in read-write mode; error code: <%d>", ctx->filename, GetLastError()); - } - else - { - hfile = CreateFileA(ctx->filename, GENERIC_READ, FILE_SHARE_WRITE|FILE_SHARE_READ, 0, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0); - if (hfile == INVALID_HANDLE_VALUE) - THError("could not open file <%s> in read-only mode; error code: <%d>", ctx->filename, GetLastError()); - } - - if (GetFileSizeEx(hfile, &hfilesz) == 0) - { - THError("could not get file size: <%s>; error code: <%d>", ctx->filename, GetLastError()); - } - - if(size > 0) - { - if(size > hfilesz.QuadPart) - { - if(ctx->flags) - { - hfilesz.QuadPart = size; - if(SetFilePointerEx(hfile, hfilesz, NULL, FILE_BEGIN) == 0) - { - CloseHandle(hfile); - THError("unable to stretch file <%s> to the right size; error code: <%d>", ctx->filename, GetLastError()); - } - if(SetEndOfFile(hfile) == 0) - { - CloseHandle(hfile); - THError("unable to write to file <%s>; error code: <%d>", ctx->filename, GetLastError()); - } - } - else - { - CloseHandle(hfile); - THError("file <%s> size is smaller than the required mapping size <%ld>; error code: <%d>", ctx->filename, size, GetLastError()); - } - } - } - else - size = hfilesz.QuadPart; - - ctx->size = size; /* if we are here, it must be the right size */ - - hfilesz.QuadPart = ctx->size; - - /* get map handle */ - if(ctx->flags) - { - if( (hmfile = CreateFileMapping(hfile, NULL, PAGE_READWRITE, hfilesz.HighPart, hfilesz.LowPart, NULL)) == NULL ) - THError("could not create a map on file <%s>; error code: <%d>", ctx->filename, GetLastError()); - } - else - { - if( (hmfile = CreateFileMapping(hfile, NULL, PAGE_WRITECOPY, hfilesz.HighPart, hfilesz.LowPart, NULL)) == NULL ) - THError("could not create a map on file <%s>; error code: <%d>", ctx->filename, 
GetLastError()); - } - - /* map the stuff */ - if(ctx->flags) - data = MapViewOfFile(hmfile, FILE_MAP_ALL_ACCESS, 0, 0, 0); - else - data = MapViewOfFile(hmfile, FILE_MAP_COPY, 0, 0, 0); - - CloseHandle(hfile); - CloseHandle(hmfile); - } -#else /* _WIN32 */ - { - /* open file */ - int fd; - int flags; - struct stat file_stat; - - if (ctx->flags & (TH_ALLOCATOR_MAPPED_SHARED | TH_ALLOCATOR_MAPPED_SHAREDMEM)) - flags = O_RDWR | O_CREAT; - else - flags = O_RDONLY; - - if (ctx->flags & TH_ALLOCATOR_MAPPED_EXCLUSIVE) - flags |= O_EXCL; - if (ctx->flags & TH_ALLOCATOR_MAPPED_NOCREATE) - flags &= ~O_CREAT; - - if (!(ctx->flags & TH_ALLOCATOR_MAPPED_FROMFD)) { - if(ctx->flags & TH_ALLOCATOR_MAPPED_SHARED) - { - if((fd = open(ctx->filename, flags, (mode_t)0600)) == -1) - THError("unable to open file <%s> in read-write mode", ctx->filename); - } - else if (ctx->flags & TH_ALLOCATOR_MAPPED_SHAREDMEM) - { -#ifdef HAVE_SHM_OPEN - if((fd = shm_open(ctx->filename, flags, (mode_t)0600)) == -1) - THError("unable to open shared memory object <%s> in read-write mode", ctx->filename); -#else - THError("unable to open file <%s> in sharedmem mode, shm_open unavailable on this platform", ctx->filename); -#endif - } - else - { - if((fd = open(ctx->filename, O_RDONLY)) == -1) - THError("unable to open file <%s> in read-only mode", ctx->filename); - } - } else { - fd = ctx->fd; - } - - if(fstat(fd, &file_stat) == -1) - { - if (!(ctx->flags & TH_ALLOCATOR_MAPPED_FROMFD)) - close(fd); - THError("unable to stat the file <%s>", ctx->filename); - } - - if(size > 0) - { - if(size > file_stat.st_size) - { - if(ctx->flags) - { - if(ftruncate(fd, size) == -1) - THError("unable to resize file <%s> to the right size", ctx->filename); - if(fstat(fd, &file_stat) == -1 || file_stat.st_size < size) - { - close(fd); - THError("unable to stretch file <%s> to the right size", ctx->filename); - } -/* on OS X write returns with errno 45 (Opperation not supported) when used - * with a file descriptor obtained 
via shm_open - */ -#ifndef __APPLE__ - if((write(fd, "", 1)) != 1) /* note that the string "" contains the '\0' byte ... */ - { - close(fd); - THError("unable to write to file <%s>", ctx->filename); - } -#endif - } - else - { - close(fd); - THError("file <%s> size is smaller than the required mapping size <%ld>", ctx->filename, size); - } - } - } - else - size = file_stat.st_size; - - ctx->size = size; /* if we are here, it must be the right size */ - - /* map it */ - if (ctx->flags & (TH_ALLOCATOR_MAPPED_SHARED | TH_ALLOCATOR_MAPPED_SHAREDMEM)) - data = mmap(NULL, ctx->size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); - else - data = mmap(NULL, ctx->size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0); - - if (ctx->flags & TH_ALLOCATOR_MAPPED_KEEPFD) { - ctx->fd = fd; - } else { - if(close(fd) == -1) - THError("Error closing file <%s>", ctx->filename); - ctx->fd = -1; - } - - if (ctx->flags & TH_ALLOCATOR_MAPPED_UNLINK) { - if (ctx->flags & TH_ALLOCATOR_MAPPED_SHAREDMEM) - { -#ifdef HAVE_SHM_UNLINK - if (shm_unlink(ctx->filename) == -1) - THError("could not unlink the shared memory file %s", ctx->filename); -#else - THError("could not unlink the shared memory file %s, shm_unlink not available on platform", ctx->filename); -#endif - } - else - { - if (unlink(ctx->filename) == -1) - THError("could not unlink file %s", ctx->filename); - } - } - - if(data == MAP_FAILED) - { - data = NULL; /* let's be sure it is NULL */ - THError("$ Torch: unable to mmap memory: you tried to mmap %dGB.", ctx->size/1073741824); - } - } -#endif - - return data; -} - -static void * THMapAllocator_alloc(void *ctx, ptrdiff_t size) { - return _map_alloc(ctx, size); -} - -static void *THMapAllocator_realloc(void* ctx, void* ptr, ptrdiff_t size) { - THError("cannot realloc mapped data"); - return NULL; -} - -static void THMapAllocator_free(void* ctx_, void* data) { - THMapAllocatorContext *ctx = ctx_; - -#ifdef _WIN32 - if(UnmapViewOfFile(data) == 0) - THError("could not unmap the shared memory 
file"); -#else /* _WIN32 */ - if (ctx->flags & TH_ALLOCATOR_MAPPED_KEEPFD) { - if (close(ctx->fd) == -1) - THError("could not close file descriptor %d", ctx->fd); - } - - if (munmap(data, ctx->size)) - THError("could not unmap the shared memory file"); - - if (!(ctx->flags & (TH_ALLOCATOR_MAPPED_FROMFD | TH_ALLOCATOR_MAPPED_UNLINK))) - { - if (ctx->flags & TH_ALLOCATOR_MAPPED_SHAREDMEM) - { -#ifdef HAVE_SHM_UNLINK - if (shm_unlink(ctx->filename) == -1) - THError("could not unlink the shared memory file %s", ctx->filename); -#else - THError("could not unlink the shared memory file %s, shm_unlink not available on platform", ctx->filename); -#endif - } - } -#endif /* _WIN32 */ - - THMapAllocatorContext_free(ctx); -} - -#else - -THMapAllocatorContext *THMapAllocatorContext_new(const char *filename, int flags) { - THError("file mapping not supported on your system"); - return NULL; -} - -void THMapAllocatorContext_free(THMapAllocatorContext *ctx) { - THError("file mapping not supported on your system"); -} - -static void *THMapAllocator_alloc(void* ctx_, ptrdiff_t size) { - THError("file mapping not supported on your system"); - return NULL; -} - -static void *THMapAllocator_realloc(void* ctx, void* ptr, ptrdiff_t size) { - THError("file mapping not supported on your system"); - return NULL; -} - -static void THMapAllocator_free(void* ctx, void* data) { - THError("file mapping not supported on your system"); -} - -#endif - -#if (defined(_WIN32) || defined(HAVE_MMAP)) && defined(TH_ATOMIC_IPC_REFCOUNT) - -static void * THRefcountedMapAllocator_alloc(void *_ctx, ptrdiff_t size) { - THMapAllocatorContext *ctx = _ctx; - - if (ctx->flags & TH_ALLOCATOR_MAPPED_FROMFD) - THError("THRefcountedMapAllocator doesn't support TH_ALLOCATOR_MAPPED_FROMFD flag"); - if (ctx->flags & TH_ALLOCATOR_MAPPED_KEEPFD) - THError("THRefcountedMapAllocator doesn't support TH_ALLOCATOR_MAPPED_KEEPFD flag"); - if (ctx->flags & TH_ALLOCATOR_MAPPED_UNLINK) - THError("THRefcountedMapAllocator doesn't 
support TH_ALLOCATOR_MAPPED_UNLINK flag"); - if (!(ctx->flags & TH_ALLOCATOR_MAPPED_SHAREDMEM)) - THError("THRefcountedMapAllocator requires TH_ALLOCATOR_MAPPED_SHAREDMEM flag"); - - size = size + TH_ALLOC_ALIGNMENT; - void *ptr = _map_alloc(ctx, size); - char *data = ((char*)ptr) + TH_ALLOC_ALIGNMENT; - THMapInfo *map_info = (THMapInfo*)ptr; - - if (ctx->flags & TH_ALLOCATOR_MAPPED_EXCLUSIVE) - map_info->refcount = 1; - else - THAtomicIncrementRef(&map_info->refcount); - - return (void*)data; -} - -static void *THRefcountedMapAllocator_realloc(void* ctx, void* ptr, ptrdiff_t size) { - THError("cannot realloc mapped data"); - return NULL; -} - -static void THRefcountedMapAllocator_free(void* ctx_, void* data) { - THMapAllocatorContext *ctx = ctx_; - -#ifdef _WIN32 - if(UnmapViewOfFile(data) == 0) - THError("could not unmap the shared memory file"); -#else /* _WIN32 */ - - THMapInfo *info = (THMapInfo*)(((char*)data) - TH_ALLOC_ALIGNMENT); - if (THAtomicDecrementRef(&info->refcount)) { -#ifdef HAVE_SHM_UNLINK - if (shm_unlink(ctx->filename) == -1) - THError("could not unlink the shared memory file %s", ctx->filename); -#else - THError("could not unlink the shared memory file %s, shm_unlink not available on platform", ctx->filename); -#endif /* HAVE_SHM_UNLINK */ - } - if (munmap(info, ctx->size)) - THError("could not unmap the shared memory file %s", ctx->filename); -#endif /* _WIN32 */ - - THMapAllocatorContext_free(ctx); -} - -void THRefcountedMapAllocator_incref(THMapAllocatorContext *ctx, void *data) -{ - THMapInfo *map_info = (THMapInfo*)(((char*)data) - TH_ALLOC_ALIGNMENT); - THAtomicIncrementRef(&map_info->refcount); -} - -int THRefcountedMapAllocator_decref(THMapAllocatorContext *ctx, void *data) -{ - THMapInfo *map_info = (THMapInfo*)(((char*)data) - TH_ALLOC_ALIGNMENT); - return THAtomicDecrementRef(&map_info->refcount); -} - -#else - -static void * THRefcountedMapAllocator_alloc(void *ctx, ptrdiff_t size) { - THError("refcounted file mapping not supported 
on your system"); - return NULL; -} - -static void *THRefcountedMapAllocator_realloc(void* ctx, void* ptr, ptrdiff_t size) { - THError("refcounted file mapping not supported on your system"); - return NULL; -} - -static void THRefcountedMapAllocator_free(void* ctx_, void* data) { - THError("refcounted file mapping not supported on your system"); -} - -void THRefcountedMapAllocator_incref(THMapAllocatorContext *ctx, void *data) -{ - THError("refcounted file mapping not supported on your system"); -} - -int THRefcountedMapAllocator_decref(THMapAllocatorContext *ctx, void *data) -{ - THError("refcounted file mapping not supported on your system"); - return 0; -} - -#endif - -THAllocator THMapAllocator = { - &THMapAllocator_alloc, - &THMapAllocator_realloc, - &THMapAllocator_free -}; - -THAllocator THRefcountedMapAllocator = { - &THRefcountedMapAllocator_alloc, - &THRefcountedMapAllocator_realloc, - &THRefcountedMapAllocator_free -}; diff --git a/contrib/lua-torch/torch7/lib/TH/THAllocator.h b/contrib/lua-torch/torch7/lib/TH/THAllocator.h deleted file mode 100644 index 18fc9ec0a..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THAllocator.h +++ /dev/null @@ -1,43 +0,0 @@ -#ifndef TH_ALLOCATOR_INC -#define TH_ALLOCATOR_INC - -#include "THGeneral.h" - -#define TH_ALLOCATOR_MAPPED_SHARED 1 -#define TH_ALLOCATOR_MAPPED_SHAREDMEM 2 -#define TH_ALLOCATOR_MAPPED_EXCLUSIVE 4 -#define TH_ALLOCATOR_MAPPED_NOCREATE 8 -#define TH_ALLOCATOR_MAPPED_KEEPFD 16 -#define TH_ALLOCATOR_MAPPED_FROMFD 32 -#define TH_ALLOCATOR_MAPPED_UNLINK 64 - -/* Custom allocator - */ -typedef struct THAllocator { - void* (*malloc)(void*, ptrdiff_t); - void* (*realloc)(void*, void*, ptrdiff_t); - void (*free)(void*, void*); -} THAllocator; - -/* default malloc/free allocator. malloc and realloc raise an error (using - * THError) on allocation failure. 
- */ -extern THAllocator THDefaultAllocator; - -/* file map allocator - */ -typedef struct THMapAllocatorContext_ THMapAllocatorContext; -TH_API THMapAllocatorContext *THMapAllocatorContext_new(const char *filename, int flags); -TH_API THMapAllocatorContext *THMapAllocatorContext_newWithFd(const char *filename, - int fd, int flags); -TH_API char * THMapAllocatorContext_filename(THMapAllocatorContext *ctx); -TH_API int THMapAllocatorContext_fd(THMapAllocatorContext *ctx); -TH_API ptrdiff_t THMapAllocatorContext_size(THMapAllocatorContext *ctx); -TH_API void THMapAllocatorContext_free(THMapAllocatorContext *ctx); -TH_API void THRefcountedMapAllocator_incref(THMapAllocatorContext *ctx, void *data); -TH_API int THRefcountedMapAllocator_decref(THMapAllocatorContext *ctx, void *data); - -extern THAllocator THMapAllocator; -extern THAllocator THRefcountedMapAllocator; - -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/THAtomic.c b/contrib/lua-torch/torch7/lib/TH/THAtomic.c deleted file mode 100644 index 714fc52db..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THAtomic.c +++ /dev/null @@ -1,267 +0,0 @@ -#include "THAtomic.h" - -/* - Note: I thank Leon Bottou for his useful comments. - Ronan. 
-*/ - -#if defined(USE_C11_ATOMICS) -#include <stdatomic.h> -#endif - -#if defined(USE_MSC_ATOMICS) -#include <intrin.h> -#include <assert.h> -#endif - -#if !defined(USE_MSC_ATOMICS) && !defined(USE_GCC_ATOMICS) && defined(USE_PTHREAD_ATOMICS) -#include <pthread.h> -static pthread_mutex_t ptm = PTHREAD_MUTEX_INITIALIZER; -#endif - -void THAtomicSet(int volatile *a, int newvalue) -{ -#if defined(USE_C11_ATOMICS) - atomic_store(a, newvalue); -#elif defined(USE_MSC_ATOMICS) - assert(sizeof(int) == sizeof(long)); - _InterlockedExchange((long*)a, newvalue); -#elif defined(USE_GCC_ATOMICS) - __sync_lock_test_and_set(a, newvalue); -#else - int oldvalue; - do { - oldvalue = *a; - } while (!THAtomicCompareAndSwap(a, oldvalue, newvalue)); -#endif -} - -int THAtomicGet(int volatile *a) -{ -#if defined(USE_C11_ATOMICS) - return atomic_load(a); -#else - int value; - do { - value = *a; - } while (!THAtomicCompareAndSwap(a, value, value)); - return value; -#endif -} - -int THAtomicAdd(int volatile *a, int value) -{ -#if defined(USE_C11_ATOMICS) - return atomic_fetch_add(a, value); -#elif defined(USE_MSC_ATOMICS) - assert(sizeof(int) == sizeof(long)); - return _InterlockedExchangeAdd((long*)a, value); -#elif defined(USE_GCC_ATOMICS) - return __sync_fetch_and_add(a, value); -#else - int oldvalue; - do { - oldvalue = *a; - } while (!THAtomicCompareAndSwap(a, oldvalue, (oldvalue + value))); - return oldvalue; -#endif -} - -void THAtomicIncrementRef(int volatile *a) -{ - THAtomicAdd(a, 1); -} - -int THAtomicDecrementRef(int volatile *a) -{ - return (THAtomicAdd(a, -1) == 1); -} - -int THAtomicCompareAndSwap(int volatile *a, int oldvalue, int newvalue) -{ -#if defined(USE_C11_ATOMICS) - return atomic_compare_exchange_strong(a, &oldvalue, newvalue); -#elif defined(USE_MSC_ATOMICS) - assert(sizeof(int) == sizeof(long)); - return (_InterlockedCompareExchange((long*)a, (long)newvalue, (long)oldvalue) == (long)oldvalue); -#elif defined(USE_GCC_ATOMICS) - return 
__sync_bool_compare_and_swap(a, oldvalue, newvalue); -#elif defined(USE_PTHREAD_ATOMICS) - int ret = 0; - pthread_mutex_lock(&ptm); - if(*a == oldvalue) { - *a = newvalue; - ret = 1; - } - pthread_mutex_unlock(&ptm); - return ret; -#else -#warning THAtomic is not thread safe - if(*a == oldvalue) { - *a = newvalue; - return 1; - } - else - return 0; -#endif -} - -void THAtomicSetLong(long volatile *a, long newvalue) -{ -#if defined(USE_C11_ATOMICS) - atomic_store(a, newvalue); -#elif defined(USE_MSC_ATOMICS) - _InterlockedExchange(a, newvalue); -#elif defined(USE_GCC_ATOMICS) - __sync_lock_test_and_set(a, newvalue); -#else - long oldvalue; - do { - oldvalue = *a; - } while (!THAtomicCompareAndSwapLong(a, oldvalue, newvalue)); -#endif -} - -long THAtomicGetLong(long volatile *a) -{ -#if defined(USE_C11_ATOMICS) - return atomic_load(a); -#else - long value; - do { - value = *a; - } while (!THAtomicCompareAndSwapLong(a, value, value)); - return value; -#endif -} - -long THAtomicAddLong(long volatile *a, long value) -{ -#if defined(USE_C11_ATOMICS) - return atomic_fetch_add(a, value); -#elif defined(USE_MSC_ATOMICS) - return _InterlockedExchangeAdd(a, value); -#elif defined(USE_GCC_ATOMICS) - return __sync_fetch_and_add(a, value); -#else - long oldvalue; - do { - oldvalue = *a; - } while (!THAtomicCompareAndSwapLong(a, oldvalue, (oldvalue + value))); - return oldvalue; -#endif -} - -long THAtomicCompareAndSwapLong(long volatile *a, long oldvalue, long newvalue) -{ -#if defined(USE_C11_ATOMICS) - return atomic_compare_exchange_strong(a, &oldvalue, newvalue); -#elif defined(USE_MSC_ATOMICS) - return (_InterlockedCompareExchange(a, newvalue, oldvalue) == oldvalue); -#elif defined(USE_GCC_ATOMICS) - return __sync_bool_compare_and_swap(a, oldvalue, newvalue); -#elif defined(USE_PTHREAD_ATOMICS) - long ret = 0; - pthread_mutex_lock(&ptm); - if(*a == oldvalue) { - *a = newvalue; - ret = 1; - } - pthread_mutex_unlock(&ptm); - return ret; -#else -#warning THAtomic is not thread 
safe - if(*a == oldvalue) { - *a = newvalue; - return 1; - } - else - return 0; -#endif -} - -void THAtomicSetPtrdiff(ptrdiff_t volatile *a, ptrdiff_t newvalue) -{ -#if defined(USE_C11_ATOMICS) - atomic_store(a, newvalue); -#elif defined(USE_MSC_ATOMICS) -#ifdef _WIN64 - _InterlockedExchange64(a, newvalue); -#else - _InterlockedExchange(a, newvalue); -#endif -#elif defined(USE_GCC_ATOMICS) - __sync_lock_test_and_set(a, newvalue); -#else - ptrdiff_t oldvalue; - do { - oldvalue = *a; - } while (!THAtomicCompareAndSwapPtrdiff(a, oldvalue, newvalue)); -#endif -} - -ptrdiff_t THAtomicGetPtrdiff(ptrdiff_t volatile *a) -{ -#if defined(USE_C11_ATOMICS) - return atomic_load(a); -#else - ptrdiff_t value; - do { - value = *a; - } while (!THAtomicCompareAndSwapPtrdiff(a, value, value)); - return value; -#endif -} - -ptrdiff_t THAtomicAddPtrdiff(ptrdiff_t volatile *a, ptrdiff_t value) -{ -#if defined(USE_C11_ATOMICS) - return atomic_fetch_add(a, value); -#elif defined(USE_MSC_ATOMICS) -#ifdef _WIN64 - return _InterlockedExchangeAdd64(a, value); -#else - return _InterlockedExchangeAdd(a, value); -#endif -#elif defined(USE_GCC_ATOMICS) - return __sync_fetch_and_add(a, value); -#else - ptrdiff_t oldvalue; - do { - oldvalue = *a; - } while (!THAtomicCompareAndSwapPtrdiff(a, oldvalue, (oldvalue + value))); - return oldvalue; -#endif -} - -ptrdiff_t THAtomicCompareAndSwapPtrdiff(ptrdiff_t volatile *a, ptrdiff_t oldvalue, ptrdiff_t newvalue) -{ -#if defined(USE_C11_ATOMICS) - return atomic_compare_exchange_strong(a, &oldvalue, newvalue); -#elif defined(USE_MSC_ATOMICS) -#ifdef _WIN64 - return (_InterlockedCompareExchange64(a, newvalue, oldvalue) == oldvalue); -#else - return (_InterlockedCompareExchange(a, newvalue, oldvalue) == oldvalue); -#endif -#elif defined(USE_GCC_ATOMICS) - return __sync_bool_compare_and_swap(a, oldvalue, newvalue); -#elif defined(USE_PTHREAD_ATOMICS) - ptrdiff_t ret = 0; - pthread_mutex_lock(&ptm); - if(*a == oldvalue) { - *a = newvalue; - ret = 1; - } - 
pthread_mutex_unlock(&ptm); - return ret; -#else -#warning THAtomic is not thread safe - if(*a == oldvalue) { - *a = newvalue; - return 1; - } - else - return 0; -#endif -} diff --git a/contrib/lua-torch/torch7/lib/TH/THAtomic.h b/contrib/lua-torch/torch7/lib/TH/THAtomic.h deleted file mode 100644 index d77b20b24..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THAtomic.h +++ /dev/null @@ -1,125 +0,0 @@ -#ifndef TH_ATOMIC_INC -#define TH_ATOMIC_INC - -#include "THGeneral.h" - -/****************************************************************************** - * Atomic operations for TH - * Five backends are integrated: - * - C11 atomic operations - * - MSVC intrinsics - * - GCC intrinsics - * - Pthread if none of the above is available - * - Unsafe mode in none of the above is available - ******************************************************************************/ - - -/****************************************************************************** - * all-purpose functions - ******************************************************************************/ - -/* - * *a = newvalue -*/ -TH_API void THAtomicSet(int volatile *a, int newvalue); - -/* - * return *a -*/ -TH_API int THAtomicGet(int volatile *a); - -/* - * *a += value, - * return previous *a -*/ -TH_API int THAtomicAdd(int volatile *a, int value); - -/* - * check if (*a == oldvalue) - * if true: set *a to newvalue, return 1 - * if false: return 0 -*/ -TH_API int THAtomicCompareAndSwap(int volatile *a, int oldvalue, int newvalue); - - -/****************************************************************************** - * refcounting functions - ******************************************************************************/ - -/* - * *a++ -*/ -TH_API void THAtomicIncrementRef(int volatile *a); - -/* - * *a--, - * return 1 if *a == 0 after the operation, 0 otherwise -*/ -TH_API int THAtomicDecrementRef(int volatile *a); - - - -/****************************************************************************** - * 
functions for long type - ******************************************************************************/ - -/* - * *a = newvalue -*/ -TH_API void THAtomicSetLong(long volatile *a, long newvalue); - -/* - * return *a -*/ -TH_API long THAtomicGetLong(long volatile *a); - -/* - * *a += value, - * return previous *a -*/ -TH_API long THAtomicAddLong(long volatile *a, long value); - -/* - * check if (*a == oldvalue) - * if true: set *a to newvalue, return 1 - * if false: return 0 -*/ -TH_API long THAtomicCompareAndSwapLong(long volatile *a, long oldvalue, long newvalue); - - - -/****************************************************************************** - * functions for ptrdiff_t type - ******************************************************************************/ - -/* - * *a = newvalue -*/ -TH_API void THAtomicSetPtrdiff(ptrdiff_t volatile *a, ptrdiff_t newvalue); - -/* - * return *a -*/ -TH_API ptrdiff_t THAtomicGetPtrdiff(ptrdiff_t volatile *a); - -/* - * *a += value, - * return previous *a -*/ -TH_API ptrdiff_t THAtomicAddPtrdiff(ptrdiff_t volatile *a, ptrdiff_t value); - -/* - * check if (*a == oldvalue) - * if true: set *a to newvalue, return 1 - * if false: return 0 -*/ -TH_API ptrdiff_t THAtomicCompareAndSwapPtrdiff(ptrdiff_t volatile *a, ptrdiff_t oldvalue, ptrdiff_t newvalue); - -#if defined(USE_C11_ATOMICS) && defined(ATOMIC_INT_LOCK_FREE) && \ - ATOMIC_INT_LOCK_FREE == 2 -#define TH_ATOMIC_IPC_REFCOUNT 1 -#elif defined(USE_MSC_ATOMICS) || defined(USE_GCC_ATOMICS) -#define TH_ATOMIC_IPC_REFCOUNT 1 -#endif - -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/THBlas.c b/contrib/lua-torch/torch7/lib/TH/THBlas.c deleted file mode 100644 index 35618b26a..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THBlas.c +++ /dev/null @@ -1,4 +0,0 @@ -#include "THBlas.h" - -#include "generic/THBlas.c" -#include "THGenerateAllTypes.h" diff --git a/contrib/lua-torch/torch7/lib/TH/THBlas.h b/contrib/lua-torch/torch7/lib/TH/THBlas.h deleted file mode 100644 index 
5fef0febc..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THBlas.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef TH_BLAS_INC -#define TH_BLAS_INC - -#include "THGeneral.h" - -#define THBlas_(NAME) TH_CONCAT_4(TH,Real,Blas_,NAME) - -#include "generic/THBlas.h" -#include "THGenerateAllTypes.h" - -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/THConfig.cmake.in b/contrib/lua-torch/torch7/lib/TH/THConfig.cmake.in deleted file mode 100644 index 306cd878b..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THConfig.cmake.in +++ /dev/null @@ -1,9 +0,0 @@ -# Find the TH includes and library -# -# TH_INCLUDE_DIR -- where to find the includes -# TH_LIBRARIES -- list of libraries to link against -# TH_FOUND -- set to 1 if found - -SET(TH_FOUND 1) -SET(TH_INCLUDE_DIR "@TH_INCLUDE_DIR@") -SET(TH_LIBRARIES "@TH_LIBRARIES@") diff --git a/contrib/lua-torch/torch7/lib/TH/THDiskFile.c b/contrib/lua-torch/torch7/lib/TH/THDiskFile.c deleted file mode 100644 index 3f57b3b35..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THDiskFile.c +++ /dev/null @@ -1,797 +0,0 @@ -#include "THGeneral.h" -#include "THDiskFile.h" -#include "THFilePrivate.h" - -#include <stdint.h> -#ifndef LLONG_MAX -#define LLONG_MAX 9223372036854775807LL -#endif - -typedef struct THDiskFile__ -{ - THFile file; - - FILE *handle; - char *name; - int isNativeEncoding; - int longSize; - -} THDiskFile; - -static int THDiskFile_isOpened(THFile *self) -{ - THDiskFile *dfself = (THDiskFile*)self; - return (dfself->handle != NULL); -} - -const char *THDiskFile_name(THFile *self) -{ - THDiskFile *dfself = (THDiskFile*)self; - return dfself->name; -} - -/* workaround mac osx lion ***insane*** fread bug */ -#ifdef __APPLE__ -size_t fread__(void *ptr, size_t size, size_t nitems, FILE *stream) -{ - size_t nread = 0; - while(!feof(stream) && !ferror(stream) && (nread < nitems)) - nread += fread((char*)ptr+nread*size, size, THMin(2147483648/size, nitems-nread), stream); - return nread; -} -#else -#define fread__ fread -#endif - 
-#define READ_WRITE_METHODS(TYPE, TYPEC, ASCII_READ_ELEM, ASCII_WRITE_ELEM) \ - static size_t THDiskFile_read##TYPEC(THFile *self, TYPE *data, size_t n) \ - { \ - THDiskFile *dfself = (THDiskFile*)(self); \ - size_t nread = 0L; \ - \ - THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); \ - THArgCheck(dfself->file.isReadable, 1, "attempt to read in a write-only file"); \ - \ - if(dfself->file.isBinary) \ - { \ - nread = fread__(data, sizeof(TYPE), n, dfself->handle); \ - if(!dfself->isNativeEncoding && (sizeof(TYPE) > 1) && (nread > 0)) \ - THDiskFile_reverseMemory(data, data, sizeof(TYPE), nread); \ - } \ - else \ - { \ - size_t i; \ - for(i = 0; i < n; i++) \ - { \ - ASCII_READ_ELEM; /* increment here result and break if wrong */ \ - } \ - if(dfself->file.isAutoSpacing && (n > 0)) \ - { \ - int c = fgetc(dfself->handle); \ - if( (c != '\n') && (c != EOF) ) \ - ungetc(c, dfself->handle); \ - } \ - } \ - \ - if(nread != n) \ - { \ - dfself->file.hasError = 1; /* shouldn't we put hasError to 0 all the time ? 
*/ \ - if(!dfself->file.isQuiet) \ - THError("read error: read %d blocks instead of %d", nread, n); \ - } \ - \ - return nread; \ - } \ - \ - static size_t THDiskFile_write##TYPEC(THFile *self, TYPE *data, size_t n) \ - { \ - THDiskFile *dfself = (THDiskFile*)(self); \ - size_t nwrite = 0L; \ - \ - THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); \ - THArgCheck(dfself->file.isWritable, 1, "attempt to write in a read-only file"); \ - \ - if(dfself->file.isBinary) \ - { \ - if(dfself->isNativeEncoding) \ - { \ - nwrite = fwrite(data, sizeof(TYPE), n, dfself->handle); \ - } \ - else \ - { \ - if(sizeof(TYPE) > 1) \ - { \ - char *buffer = THAlloc(sizeof(TYPE)*n); \ - THDiskFile_reverseMemory(buffer, data, sizeof(TYPE), n); \ - nwrite = fwrite(buffer, sizeof(TYPE), n, dfself->handle); \ - THFree(buffer); \ - } \ - else \ - nwrite = fwrite(data, sizeof(TYPE), n, dfself->handle); \ - } \ - } \ - else \ - { \ - size_t i; \ - for(i = 0; i < n; i++) \ - { \ - ASCII_WRITE_ELEM; \ - if( dfself->file.isAutoSpacing && (i < n-1) ) \ - fprintf(dfself->handle, " "); \ - } \ - if(dfself->file.isAutoSpacing && (n > 0)) \ - fprintf(dfself->handle, "\n"); \ - } \ - \ - if(nwrite != n) \ - { \ - dfself->file.hasError = 1; \ - if(!dfself->file.isQuiet) \ - THError("write error: wrote %d blocks instead of %d", nwrite, n); \ - } \ - \ - return nwrite; \ -} - -static int THDiskFile_mode(const char *mode, int *isReadable, int *isWritable) -{ - *isReadable = 0; - *isWritable = 0; - if(strlen(mode) == 1) - { - if(*mode == 'r') - { - *isReadable = 1; - return 1; - } - else if(*mode == 'w') - { - *isWritable = 1; - return 1; - } - } - else if(strlen(mode) == 2) - { - if(mode[0] == 'r' && mode[1] == 'w') - { - *isReadable = 1; - *isWritable = 1; - return 1; - } - } - return 0; -} - -static void THDiskFile_synchronize(THFile *self) -{ - THDiskFile *dfself = (THDiskFile*)(self); - THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); - fflush(dfself->handle); 
-} - -static void THDiskFile_seek(THFile *self, size_t position) -{ - THDiskFile *dfself = (THDiskFile*)(self); - - THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); - -#if defined(_WIN64) - THArgCheck(position <= (size_t)INT64_MAX, 2, "position must be smaller than INT64_MAX"); - if(_fseeki64(dfself->handle, (__int64)position, SEEK_SET) < 0) -#elif defined(_WIN32) - THArgCheck(position <= (size_t)LONG_MAX, 2, "position must be smaller than LONG_MAX"); - if(fseek(dfself->handle, (long)position, SEEK_SET) < 0) -#else - THArgCheck(position <= (size_t)LLONG_MAX, 2, "position must be smaller than LLONG_MAX"); - if(fseeko(dfself->handle, (off_t)position, SEEK_SET) < 0) -#endif - { - dfself->file.hasError = 1; - if(!dfself->file.isQuiet) - THError("unable to seek to position %zu", position); - } -} - -static void THDiskFile_seekEnd(THFile *self) -{ - THDiskFile *dfself = (THDiskFile*)(self); - - THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); - -#if defined(_WIN64) - if(_fseeki64(dfself->handle, 0, SEEK_END) < 0) -#elif defined(_WIN32) - if(fseek(dfself->handle, 0, SEEK_END) < 0) -#else - if(fseeko(dfself->handle, 0, SEEK_END) < 0) -#endif - { - dfself->file.hasError = 1; - if(!dfself->file.isQuiet) - THError("unable to seek at end of file"); - } -} - -static size_t THDiskFile_position(THFile *self) -{ - THDiskFile *dfself = (THDiskFile*)(self); - THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); - -#if defined(_WIN64) - __int64 offset = _ftelli64(dfself->handle); -#elif defined(_WIN32) - long offset = ftell(dfself->handle); -#else - off_t offset = ftello(dfself->handle); -#endif - if (offset > -1) - return (size_t)offset; - else if(!dfself->file.isQuiet) - THError("unable to obtain disk file offset (maybe a long overflow occurred)"); - - return 0; -} - -static void THDiskFile_close(THFile *self) -{ - THDiskFile *dfself = (THDiskFile*)(self); - THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed 
file"); - fclose(dfself->handle); - dfself->handle = NULL; -} - -/* Little and Big Endian */ - -static void THDiskFile_reverseMemory(void *dst, const void *src, size_t blockSize, size_t numBlocks) -{ - if(blockSize > 1) - { - size_t halfBlockSize = blockSize/2; - char *charSrc = (char*)src; - char *charDst = (char*)dst; - size_t b, i; - for(b = 0; b < numBlocks; b++) - { - for(i = 0; i < halfBlockSize; i++) - { - char z = charSrc[i]; - charDst[i] = charSrc[blockSize-1-i]; - charDst[blockSize-1-i] = z; - } - charSrc += blockSize; - charDst += blockSize; - } - } -} - -int THDiskFile_isLittleEndianCPU(void) -{ - int x = 7; - char *ptr = (char *)&x; - - if(ptr[0] == 0) - return 0; - else - return 1; -} - -int THDiskFile_isBigEndianCPU(void) -{ - return(!THDiskFile_isLittleEndianCPU()); -} - -void THDiskFile_nativeEndianEncoding(THFile *self) -{ - THDiskFile *dfself = (THDiskFile*)(self); - THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); - dfself->isNativeEncoding = 1; -} - -void THDiskFile_littleEndianEncoding(THFile *self) -{ - THDiskFile *dfself = (THDiskFile*)(self); - THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); - dfself->isNativeEncoding = THDiskFile_isLittleEndianCPU(); -} - -void THDiskFile_bigEndianEncoding(THFile *self) -{ - THDiskFile *dfself = (THDiskFile*)(self); - THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); - dfself->isNativeEncoding = !THDiskFile_isLittleEndianCPU(); -} - -/* End of Little and Big Endian Stuff */ - -void THDiskFile_longSize(THFile *self, int size) -{ - THDiskFile *dfself = (THDiskFile*)(self); - THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); - THArgCheck(size == 0 || size == 4 || size == 8, 1, "Invalid long size specified"); - dfself->longSize = size; -} - -void THDiskFile_noBuffer(THFile *self) -{ - THDiskFile *dfself = (THDiskFile*)(self); - THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); - if 
(setvbuf(dfself->handle, NULL, _IONBF, 0)) { - THError("error: cannot disable buffer"); - } -} - -static void THDiskFile_free(THFile *self) -{ - THDiskFile *dfself = (THDiskFile*)(self); - if(dfself->handle) - fclose(dfself->handle); - THFree(dfself->name); - THFree(dfself); -} - -/* READ_WRITE_METHODS(int, Bool, */ -/* int value = 0; int ret = fscanf(file->handle, "%d", &value); array[i] = (value ? 1 : 0); if(ret <= 0) break; else result++, */ -/* int value = (array[i] ? 1 : 0); nElemWritten = fprintf(file->handle, "%d", value), */ -/* true) */ - -/* Note that we do a trick */ -READ_WRITE_METHODS(unsigned char, Byte, - nread = fread(data, 1, n, dfself->handle); break, - nwrite = fwrite(data, 1, n, dfself->handle); break) - -READ_WRITE_METHODS(char, Char, - nread = fread(data, 1, n, dfself->handle); break, - nwrite = fwrite(data, 1, n, dfself->handle); break) - -READ_WRITE_METHODS(short, Short, - int ret = fscanf(dfself->handle, "%hd", &data[i]); if(ret <= 0) break; else nread++, - int ret = fprintf(dfself->handle, "%hd", data[i]); if(ret <= 0) break; else nwrite++) - -READ_WRITE_METHODS(int, Int, - int ret = fscanf(dfself->handle, "%d", &data[i]); if(ret <= 0) break; else nread++, - int ret = fprintf(dfself->handle, "%d", data[i]); if(ret <= 0) break; else nwrite++) - -READ_WRITE_METHODS(float, Float, - int ret = fscanf(dfself->handle, "%g", &data[i]); if(ret <= 0) break; else nread++, - int ret = fprintf(dfself->handle, "%.9g", data[i]); if(ret <= 0) break; else nwrite++) - -READ_WRITE_METHODS(THHalf, Half, - float buf; int ret = fscanf(dfself->handle, "%g", &buf); if(ret <= 0) break; else { data[i]= TH_float2half(buf); nread++; }, - int ret = fprintf(dfself->handle, "%.9g", TH_half2float(data[i])); if(ret <= 0) break; else nwrite++) - -READ_WRITE_METHODS(double, Double, - int ret = fscanf(dfself->handle, "%lg", &data[i]); if(ret <= 0) break; else nread++, - int ret = fprintf(dfself->handle, "%.17g", data[i]); if(ret <= 0) break; else nwrite++) - - -/* For Long 
we need to rewrite everything, because of the special management of longSize */ -static size_t THDiskFile_readLong(THFile *self, long *data, size_t n) -{ - THDiskFile *dfself = (THDiskFile*)(self); - size_t nread = 0L; - - THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); - THArgCheck(dfself->file.isReadable, 1, "attempt to read in a write-only file"); - - if(dfself->file.isBinary) - { - if(dfself->longSize == 0 || dfself->longSize == sizeof(long)) - { - nread = fread__(data, sizeof(long), n, dfself->handle); - if(!dfself->isNativeEncoding && (sizeof(long) > 1) && (nread > 0)) - THDiskFile_reverseMemory(data, data, sizeof(long), nread); - } else if(dfself->longSize == 4) - { - nread = fread__(data, 4, n, dfself->handle); - if(!dfself->isNativeEncoding && (nread > 0)) - THDiskFile_reverseMemory(data, data, 4, nread); - size_t i; - for(i = nread; i > 0; i--) - data[i-1] = ((int *)data)[i-1]; - } - else /* if(dfself->longSize == 8) */ - { - int big_endian = !THDiskFile_isLittleEndianCPU(); - int32_t *buffer = THAlloc(8*n); - nread = fread__(buffer, 8, n, dfself->handle); - size_t i; - for(i = nread; i > 0; i--) - data[i-1] = buffer[2*(i-1) + big_endian]; - THFree(buffer); - if(!dfself->isNativeEncoding && (nread > 0)) - THDiskFile_reverseMemory(data, data, 4, nread); - } - } - else - { - size_t i; - for(i = 0; i < n; i++) - { - int ret = fscanf(dfself->handle, "%ld", &data[i]); if(ret <= 0) break; else nread++; - } - if(dfself->file.isAutoSpacing && (n > 0)) - { - int c = fgetc(dfself->handle); - if( (c != '\n') && (c != EOF) ) - ungetc(c, dfself->handle); - } - } - - if(nread != n) - { - dfself->file.hasError = 1; /* shouldn't we put hasError to 0 all the time ? 
*/ - if(!dfself->file.isQuiet) - THError("read error: read %d blocks instead of %d", nread, n); - } - - return nread; -} - -static size_t THDiskFile_writeLong(THFile *self, long *data, size_t n) -{ - THDiskFile *dfself = (THDiskFile*)(self); - size_t nwrite = 0L; - - THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); - THArgCheck(dfself->file.isWritable, 1, "attempt to write in a read-only file"); - - if(dfself->file.isBinary) - { - if(dfself->longSize == 0 || dfself->longSize == sizeof(long)) - { - if(dfself->isNativeEncoding) - { - nwrite = fwrite(data, sizeof(long), n, dfself->handle); - } - else - { - char *buffer = THAlloc(sizeof(long)*n); - THDiskFile_reverseMemory(buffer, data, sizeof(long), n); - nwrite = fwrite(buffer, sizeof(long), n, dfself->handle); - THFree(buffer); - } - } else if(dfself->longSize == 4) - { - int32_t *buffer = THAlloc(4*n); - size_t i; - for(i = 0; i < n; i++) - buffer[i] = data[i]; - if(!dfself->isNativeEncoding) - THDiskFile_reverseMemory(buffer, buffer, 4, n); - nwrite = fwrite(buffer, 4, n, dfself->handle); - THFree(buffer); - } - else /* if(dfself->longSize == 8) */ - { - int big_endian = !THDiskFile_isLittleEndianCPU(); - int32_t *buffer = THAlloc(8*n); - size_t i; - for(i = 0; i < n; i++) - { - buffer[2*i + !big_endian] = 0; - buffer[2*i + big_endian] = data[i]; - } - if(!dfself->isNativeEncoding) - THDiskFile_reverseMemory(buffer, buffer, 8, n); - nwrite = fwrite(buffer, 8, n, dfself->handle); - THFree(buffer); - } - } - else - { - size_t i; - for(i = 0; i < n; i++) - { - int ret = fprintf(dfself->handle, "%ld", data[i]); if(ret <= 0) break; else nwrite++; - if( dfself->file.isAutoSpacing && (i < n-1) ) - fprintf(dfself->handle, " "); - } - if(dfself->file.isAutoSpacing && (n > 0)) - fprintf(dfself->handle, "\n"); - } - - if(nwrite != n) - { - dfself->file.hasError = 1; - if(!dfself->file.isQuiet) - THError("write error: wrote %d blocks instead of %d", nwrite, n); - } - - return nwrite; -} - -static size_t 
THDiskFile_readString(THFile *self, const char *format, char **str_) -{ - THDiskFile *dfself = (THDiskFile*)(self); - THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); - THArgCheck(dfself->file.isReadable, 1, "attempt to read in a write-only file"); - THArgCheck((strlen(format) >= 2 ? (format[0] == '*') && (format[1] == 'a' || format[1] == 'l') : 0), 2, "format must be '*a' or '*l'"); - -/* note: the string won't survive long, as it is copied into lua */ -/* so 1024 is not that big... */ -#define TBRS_BSZ 1024L - - if(format[1] == 'a') - { - char *p = THAlloc(TBRS_BSZ); - size_t total = TBRS_BSZ; - size_t pos = 0; - - for (;;) - { - if(total-pos == 0) /* we need more space! */ - { - total += TBRS_BSZ; - p = THRealloc(p, total); - } - pos += fread(p+pos, 1, total-pos, dfself->handle); - if (pos < total) /* eof? */ - { - if(pos == 0) - { - THFree(p); - dfself->file.hasError = 1; - if(!dfself->file.isQuiet) - THError("read error: read 0 blocks instead of 1"); - - *str_ = NULL; - return 0; - } - *str_ = p; - return pos; - } - } - } - else - { - char *p = THAlloc(TBRS_BSZ); - size_t total = TBRS_BSZ; - size_t pos = 0; - size_t size; - - for (;;) - { - if(total-pos <= 1) /* we can only write '\0' in there! */ - { - total += TBRS_BSZ; - p = THRealloc(p, total); - } - if (fgets(p+pos, total-pos, dfself->handle) == NULL) /* eof? 
*/ - { - if(pos == 0) - { - THFree(p); - dfself->file.hasError = 1; - if(!dfself->file.isQuiet) - THError("read error: read 0 blocks instead of 1"); - - *str_ = NULL; - return 0; - } - *str_ = p; - return pos; - } - size = strlen(p+pos); - if (size == 0 || (p+pos)[size-1] != '\n') - { - pos += size; - } - else - { - pos += size-1; /* do not include `eol' */ - *str_ = p; - return pos; - } - } - } - - *str_ = NULL; - return 0; -} - - -static size_t THDiskFile_writeString(THFile *self, const char *str, size_t size) -{ - THDiskFile *dfself = (THDiskFile*)(self); - size_t nwrite; - - THArgCheck(dfself->handle != NULL, 1, "attempt to use a closed file"); - THArgCheck(dfself->file.isWritable, 1, "attempt to write in a read-only file"); - - nwrite = fwrite(str, 1, size, dfself->handle); - if(nwrite != size) - { - dfself->file.hasError = 1; - if(!dfself->file.isQuiet) - THError("write error: wrote %zu blocks instead of %zu", nwrite, size); - } - - return nwrite; -} - -THFile *THDiskFile_new(const char *name, const char *mode, int isQuiet) -{ - static struct THFileVTable vtable = { - THDiskFile_isOpened, - - THDiskFile_readByte, - THDiskFile_readChar, - THDiskFile_readShort, - THDiskFile_readInt, - THDiskFile_readLong, - THDiskFile_readFloat, - THDiskFile_readDouble, - THDiskFile_readHalf, - THDiskFile_readString, - - THDiskFile_writeByte, - THDiskFile_writeChar, - THDiskFile_writeShort, - THDiskFile_writeInt, - THDiskFile_writeLong, - THDiskFile_writeFloat, - THDiskFile_writeDouble, - THDiskFile_writeHalf, - THDiskFile_writeString, - - THDiskFile_synchronize, - THDiskFile_seek, - THDiskFile_seekEnd, - THDiskFile_position, - THDiskFile_close, - THDiskFile_free - }; - - int isReadable; - int isWritable; - FILE *handle; - THDiskFile *self; - - THArgCheck(THDiskFile_mode(mode, &isReadable, &isWritable), 2, "file mode should be 'r','w' or 'rw'"); - - if( isReadable && isWritable ) - { - handle = fopen(name, "r+b"); - if(!handle) - { - handle = fopen(name, "wb"); - if(handle) - { 
- fclose(handle); - handle = fopen(name, "r+b"); - } - } - } - else - handle = fopen(name, (isReadable ? "rb" : "wb")); - - if(!handle) - { - if(isQuiet) - return 0; - else - THError("cannot open <%s> in mode %c%c", name, (isReadable ? 'r' : ' '), (isWritable ? 'w' : ' ')); - } - - self = THAlloc(sizeof(THDiskFile)); - - self->handle = handle; - self->name = THAlloc(strlen(name)+1); - strcpy(self->name, name); - self->isNativeEncoding = 1; - self->longSize = 0; - - self->file.vtable = &vtable; - self->file.isQuiet = isQuiet; - self->file.isReadable = isReadable; - self->file.isWritable = isWritable; - self->file.isBinary = 0; - self->file.isAutoSpacing = 1; - self->file.hasError = 0; - - return (THFile*)self; -} - -/* PipeFile */ - -static int THPipeFile_mode(const char *mode, int *isReadable, int *isWritable) -{ - *isReadable = 0; - *isWritable = 0; - if(strlen(mode) == 1) - { - if(*mode == 'r') - { - *isReadable = 1; - return 1; - } - else if(*mode == 'w') - { - *isWritable = 1; - return 1; - } - } - return 0; -} - -static void THPipeFile_free(THFile *self) -{ - THDiskFile *dfself = (THDiskFile*)(self); - if(dfself->handle) - pclose(dfself->handle); - THFree(dfself->name); - THFree(dfself); -} - -THFile *THPipeFile_new(const char *name, const char *mode, int isQuiet) -{ - static struct THFileVTable vtable = { - THDiskFile_isOpened, - - THDiskFile_readByte, - THDiskFile_readChar, - THDiskFile_readShort, - THDiskFile_readInt, - THDiskFile_readLong, - THDiskFile_readFloat, - THDiskFile_readDouble, - THDiskFile_readHalf, - THDiskFile_readString, - - THDiskFile_writeByte, - THDiskFile_writeChar, - THDiskFile_writeShort, - THDiskFile_writeInt, - THDiskFile_writeLong, - THDiskFile_writeFloat, - THDiskFile_writeDouble, - THDiskFile_writeHalf, - THDiskFile_writeString, - - THDiskFile_synchronize, - THDiskFile_seek, - THDiskFile_seekEnd, - THDiskFile_position, - THDiskFile_close, - THPipeFile_free - }; - - int isReadable; - int isWritable; - FILE *handle; - THDiskFile 
*self; - - THArgCheck(THPipeFile_mode(mode, &isReadable, &isWritable), 2, "file mode should be 'r','w'"); - -#ifdef _WIN32 - handle = _popen(name, (isReadable ? "rb" : "wb")); -#else - handle = popen(name, (isReadable ? "r" : "w")); -#endif - - if(!handle) - { - if(isQuiet) - return 0; - else - THError("cannot open <%s> in mode %c%c. This might be because eg the executable doesn't exist, but it could also be because you are out of memory.", name, (isReadable ? 'r' : ' '), (isWritable ? 'w' : ' ')); - } - - self = THAlloc(sizeof(THDiskFile)); - - self->handle = handle; - self->name = THAlloc(strlen(name)+1); - strcpy(self->name, name); - self->isNativeEncoding = 1; - self->longSize = 0; - - self->file.vtable = &vtable; - self->file.isQuiet = isQuiet; - self->file.isReadable = isReadable; - self->file.isWritable = isWritable; - self->file.isBinary = 0; - self->file.isAutoSpacing = 1; - self->file.hasError = 0; - - return (THFile*)self; -} diff --git a/contrib/lua-torch/torch7/lib/TH/THDiskFile.h b/contrib/lua-torch/torch7/lib/TH/THDiskFile.h deleted file mode 100644 index bc5c001c7..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THDiskFile.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef TH_DISK_FILE_INC -#define TH_DISK_FILE_INC - -#include "THFile.h" - -TH_API THFile *THDiskFile_new(const char *name, const char *mode, int isQuiet); -TH_API THFile *THPipeFile_new(const char *name, const char *mode, int isQuiet); - -TH_API const char *THDiskFile_name(THFile *self); - -TH_API int THDiskFile_isLittleEndianCPU(void); -TH_API int THDiskFile_isBigEndianCPU(void); -TH_API void THDiskFile_nativeEndianEncoding(THFile *self); -TH_API void THDiskFile_littleEndianEncoding(THFile *self); -TH_API void THDiskFile_bigEndianEncoding(THFile *self); -TH_API void THDiskFile_longSize(THFile *self, int size); -TH_API void THDiskFile_noBuffer(THFile *self); - -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/THFile.c b/contrib/lua-torch/torch7/lib/TH/THFile.c deleted file mode 100644 index 
3717b7b5c..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THFile.c +++ /dev/null @@ -1,157 +0,0 @@ -#include "THFile.h" -#include "THFilePrivate.h" - -#define IMPLEMENT_THFILE_RW(TYPEC, TYPE) \ - size_t THFile_read##TYPEC##Raw(THFile *self, TYPE *data, size_t n) \ - { \ - return (*self->vtable->read##TYPEC)(self, data, n); \ - } \ - \ - size_t THFile_write##TYPEC##Raw(THFile *self, TYPE *data, size_t n) \ - { \ - return (*self->vtable->write##TYPEC)(self, data, n); \ - } - -IMPLEMENT_THFILE_RW(Byte, unsigned char) -IMPLEMENT_THFILE_RW(Char, char) -IMPLEMENT_THFILE_RW(Short, short) -IMPLEMENT_THFILE_RW(Int, int) -IMPLEMENT_THFILE_RW(Long, long) -IMPLEMENT_THFILE_RW(Float, float) -IMPLEMENT_THFILE_RW(Double, double) -IMPLEMENT_THFILE_RW(Half, THHalf) - -size_t THFile_readStringRaw(THFile *self, const char *format, char **str_) -{ - return self->vtable->readString(self, format, str_); -} - -size_t THFile_writeStringRaw(THFile *self, const char *str, size_t size) -{ - return self->vtable->writeString(self, str, size); -} - -void THFile_synchronize(THFile *self) -{ - self->vtable->synchronize(self); -} - -void THFile_seek(THFile *self, size_t position) -{ - self->vtable->seek(self, position); -} - -void THFile_seekEnd(THFile *self) -{ - self->vtable->seekEnd(self); -} - -size_t THFile_position(THFile *self) -{ - return self->vtable->position(self); -} - -void THFile_close(THFile *self) -{ - self->vtable->close(self); -} - -void THFile_free(THFile *self) -{ - self->vtable->free(self); -} - -int THFile_isOpened(THFile *self) -{ - return self->vtable->isOpened(self); -} - -#define IMPLEMENT_THFILE_FLAGS(FLAG) \ - int THFile_##FLAG(THFile *self) \ - { \ - return self->FLAG; \ - } - -IMPLEMENT_THFILE_FLAGS(isQuiet) -IMPLEMENT_THFILE_FLAGS(isReadable) -IMPLEMENT_THFILE_FLAGS(isWritable) -IMPLEMENT_THFILE_FLAGS(isBinary) -IMPLEMENT_THFILE_FLAGS(isAutoSpacing) -IMPLEMENT_THFILE_FLAGS(hasError) - -void THFile_binary(THFile *self) -{ - self->isBinary = 1; -} - -void 
THFile_ascii(THFile *self) -{ - self->isBinary = 0; -} - -void THFile_autoSpacing(THFile *self) -{ - self->isAutoSpacing = 1; -} - -void THFile_noAutoSpacing(THFile *self) -{ - self->isAutoSpacing = 0; -} - -void THFile_quiet(THFile *self) -{ - self->isQuiet = 1; -} - -void THFile_pedantic(THFile *self) -{ - self->isQuiet = 0; -} - -void THFile_clearError(THFile *self) -{ - self->hasError = 0; -} - -#define IMPLEMENT_THFILE_SCALAR(TYPEC, TYPE) \ - TYPE THFile_read##TYPEC##Scalar(THFile *self) \ - { \ - TYPE scalar; \ - THFile_read##TYPEC##Raw(self, &scalar, 1); \ - return scalar; \ - } \ - \ - void THFile_write##TYPEC##Scalar(THFile *self, TYPE scalar) \ - { \ - THFile_write##TYPEC##Raw(self, &scalar, 1); \ - } - -IMPLEMENT_THFILE_SCALAR(Byte, unsigned char) -IMPLEMENT_THFILE_SCALAR(Char, char) -IMPLEMENT_THFILE_SCALAR(Short, short) -IMPLEMENT_THFILE_SCALAR(Int, int) -IMPLEMENT_THFILE_SCALAR(Long, long) -IMPLEMENT_THFILE_SCALAR(Float, float) -IMPLEMENT_THFILE_SCALAR(Double, double) -IMPLEMENT_THFILE_SCALAR(Half, THHalf) - -#define IMPLEMENT_THFILE_STORAGE(TYPEC, TYPE) \ - size_t THFile_read##TYPEC(THFile *self, TH##TYPEC##Storage *storage) \ - { \ - return THFile_read##TYPEC##Raw(self, storage->data, storage->size); \ - } \ - \ - size_t THFile_write##TYPEC(THFile *self, TH##TYPEC##Storage *storage) \ - { \ - return THFile_write##TYPEC##Raw(self, storage->data, storage->size); \ - } - -IMPLEMENT_THFILE_STORAGE(Byte, unsigned char) -IMPLEMENT_THFILE_STORAGE(Char, char) -IMPLEMENT_THFILE_STORAGE(Short, short) -IMPLEMENT_THFILE_STORAGE(Int, int) -IMPLEMENT_THFILE_STORAGE(Long, long) -IMPLEMENT_THFILE_STORAGE(Float, float) -IMPLEMENT_THFILE_STORAGE(Double, double) -IMPLEMENT_THFILE_STORAGE(Half, THHalf) diff --git a/contrib/lua-torch/torch7/lib/TH/THFile.h b/contrib/lua-torch/torch7/lib/TH/THFile.h deleted file mode 100644 index e097bdf34..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THFile.h +++ /dev/null @@ -1,91 +0,0 @@ -#ifndef TH_FILE_INC -#define TH_FILE_INC - 
-#include "THStorage.h" - -typedef struct THFile__ THFile; - -TH_API int THFile_isOpened(THFile *self); -TH_API int THFile_isQuiet(THFile *self); -TH_API int THFile_isReadable(THFile *self); -TH_API int THFile_isWritable(THFile *self); -TH_API int THFile_isBinary(THFile *self); -TH_API int THFile_isAutoSpacing(THFile *self); -TH_API int THFile_hasError(THFile *self); - -TH_API void THFile_binary(THFile *self); -TH_API void THFile_ascii(THFile *self); -TH_API void THFile_autoSpacing(THFile *self); -TH_API void THFile_noAutoSpacing(THFile *self); -TH_API void THFile_quiet(THFile *self); -TH_API void THFile_pedantic(THFile *self); -TH_API void THFile_clearError(THFile *self); - -/* scalar */ -TH_API unsigned char THFile_readByteScalar(THFile *self); -TH_API char THFile_readCharScalar(THFile *self); -TH_API short THFile_readShortScalar(THFile *self); -TH_API int THFile_readIntScalar(THFile *self); -TH_API long THFile_readLongScalar(THFile *self); -TH_API float THFile_readFloatScalar(THFile *self); -TH_API double THFile_readDoubleScalar(THFile *self); - -TH_API void THFile_writeByteScalar(THFile *self, unsigned char scalar); -TH_API void THFile_writeCharScalar(THFile *self, char scalar); -TH_API void THFile_writeShortScalar(THFile *self, short scalar); -TH_API void THFile_writeIntScalar(THFile *self, int scalar); -TH_API void THFile_writeLongScalar(THFile *self, long scalar); -TH_API void THFile_writeFloatScalar(THFile *self, float scalar); -TH_API void THFile_writeDoubleScalar(THFile *self, double scalar); - -/* storage */ -TH_API size_t THFile_readByte(THFile *self, THByteStorage *storage); -TH_API size_t THFile_readChar(THFile *self, THCharStorage *storage); -TH_API size_t THFile_readShort(THFile *self, THShortStorage *storage); -TH_API size_t THFile_readInt(THFile *self, THIntStorage *storage); -TH_API size_t THFile_readLong(THFile *self, THLongStorage *storage); -TH_API size_t THFile_readFloat(THFile *self, THFloatStorage *storage); -TH_API size_t 
THFile_readDouble(THFile *self, THDoubleStorage *storage); - -TH_API size_t THFile_writeByte(THFile *self, THByteStorage *storage); -TH_API size_t THFile_writeChar(THFile *self, THCharStorage *storage); -TH_API size_t THFile_writeShort(THFile *self, THShortStorage *storage); -TH_API size_t THFile_writeInt(THFile *self, THIntStorage *storage); -TH_API size_t THFile_writeLong(THFile *self, THLongStorage *storage); -TH_API size_t THFile_writeFloat(THFile *self, THFloatStorage *storage); -TH_API size_t THFile_writeDouble(THFile *self, THDoubleStorage *storage); - -/* raw */ -TH_API size_t THFile_readByteRaw(THFile *self, unsigned char *data, size_t n); -TH_API size_t THFile_readCharRaw(THFile *self, char *data, size_t n); -TH_API size_t THFile_readShortRaw(THFile *self, short *data, size_t n); -TH_API size_t THFile_readIntRaw(THFile *self, int *data, size_t n); -TH_API size_t THFile_readLongRaw(THFile *self, long *data, size_t n); -TH_API size_t THFile_readFloatRaw(THFile *self, float *data, size_t n); -TH_API size_t THFile_readDoubleRaw(THFile *self, double *data, size_t n); -TH_API size_t THFile_readStringRaw(THFile *self, const char *format, char **str_); /* you must deallocate str_ */ - -TH_API size_t THFile_writeByteRaw(THFile *self, unsigned char *data, size_t n); -TH_API size_t THFile_writeCharRaw(THFile *self, char *data, size_t n); -TH_API size_t THFile_writeShortRaw(THFile *self, short *data, size_t n); -TH_API size_t THFile_writeIntRaw(THFile *self, int *data, size_t n); -TH_API size_t THFile_writeLongRaw(THFile *self, long *data, size_t n); -TH_API size_t THFile_writeFloatRaw(THFile *self, float *data, size_t n); -TH_API size_t THFile_writeDoubleRaw(THFile *self, double *data, size_t n); -TH_API size_t THFile_writeStringRaw(THFile *self, const char *str, size_t size); - -TH_API THHalf THFile_readHalfScalar(THFile *self); -TH_API void THFile_writeHalfScalar(THFile *self, THHalf scalar); -TH_API size_t THFile_readHalf(THFile *self, THHalfStorage *storage); 
-TH_API size_t THFile_writeHalf(THFile *self, THHalfStorage *storage); -TH_API size_t THFile_readHalfRaw(THFile *self, THHalf* data, size_t size); -TH_API size_t THFile_writeHalfRaw(THFile *self, THHalf* data, size_t size); - -TH_API void THFile_synchronize(THFile *self); -TH_API void THFile_seek(THFile *self, size_t position); -TH_API void THFile_seekEnd(THFile *self); -TH_API size_t THFile_position(THFile *self); -TH_API void THFile_close(THFile *self); -TH_API void THFile_free(THFile *self); - -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/THFilePrivate.h b/contrib/lua-torch/torch7/lib/TH/THFilePrivate.h deleted file mode 100644 index 55169c3bc..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THFilePrivate.h +++ /dev/null @@ -1,50 +0,0 @@ -#include "THGeneral.h" - -#include "THHalf.h" - - -struct THFile__ -{ - struct THFileVTable *vtable; - - int isQuiet; - int isReadable; - int isWritable; - int isBinary; - int isAutoSpacing; - int hasError; -}; - -/* virtual table definition */ - -struct THFileVTable -{ - int (*isOpened)(THFile *self); - - size_t (*readByte)(THFile *self, unsigned char *data, size_t n); - size_t (*readChar)(THFile *self, char *data, size_t n); - size_t (*readShort)(THFile *self, short *data, size_t n); - size_t (*readInt)(THFile *self, int *data, size_t n); - size_t (*readLong)(THFile *self, long *data, size_t n); - size_t (*readFloat)(THFile *self, float *data, size_t n); - size_t (*readDouble)(THFile *self, double *data, size_t n); - size_t (*readHalf)(THFile *self, THHalf *data, size_t n); - size_t (*readString)(THFile *self, const char *format, char **str_); - - size_t (*writeByte)(THFile *self, unsigned char *data, size_t n); - size_t (*writeChar)(THFile *self, char *data, size_t n); - size_t (*writeShort)(THFile *self, short *data, size_t n); - size_t (*writeInt)(THFile *self, int *data, size_t n); - size_t (*writeLong)(THFile *self, long *data, size_t n); - size_t (*writeFloat)(THFile *self, float *data, size_t n); - size_t 
(*writeDouble)(THFile *self, double *data, size_t n); - size_t (*writeHalf)(THFile *self, THHalf *data, size_t n); - size_t (*writeString)(THFile *self, const char *str, size_t size); - - void (*synchronize)(THFile *self); - void (*seek)(THFile *self, size_t position); - void (*seekEnd)(THFile *self); - size_t (*position)(THFile *self); - void (*close)(THFile *self); - void (*free)(THFile *self); -}; diff --git a/contrib/lua-torch/torch7/lib/TH/THGeneral.c b/contrib/lua-torch/torch7/lib/TH/THGeneral.c deleted file mode 100644 index f093c422f..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THGeneral.c +++ /dev/null @@ -1,406 +0,0 @@ -#include "THGeneral.h" -#include "THAtomic.h" - -#ifdef _OPENMP -#include <omp.h> -#endif - -#ifndef TH_HAVE_THREAD -#define __thread -#elif _MSC_VER -#define __thread __declspec( thread ) -#endif - -#if defined(__APPLE__) -#include <malloc/malloc.h> -#endif - -#if defined(__linux__) -#include <malloc.h> -#endif - -#if defined(__FreeBSD__) -#include <malloc_np.h> -#endif - -/* Torch Error Handling */ -static void defaultErrorHandlerFunction(const char *msg, void *data) -{ - printf("$ Error: %s\n", msg); - abort(); -} - -static THErrorHandlerFunction defaultErrorHandler = defaultErrorHandlerFunction; -static void *defaultErrorHandlerData; -static __thread THErrorHandlerFunction threadErrorHandler = NULL; -static __thread void *threadErrorHandlerData; - -void _THError(const char *file, const int line, const char *fmt, ...) -{ - char msg[2048]; - va_list args; - - /* vasprintf not standard */ - /* vsnprintf: how to handle if does not exists? 
*/ - va_start(args, fmt); - int n = vsnprintf(msg, 2048, fmt, args); - va_end(args); - - if(n < 2048) { - snprintf(msg + n, 2048 - n, " at %s:%d", file, line); - } - - if (threadErrorHandler) - (*threadErrorHandler)(msg, threadErrorHandlerData); - else - (*defaultErrorHandler)(msg, defaultErrorHandlerData); -} - -void _THAssertionFailed(const char *file, const int line, const char *exp, const char *fmt, ...) { - char msg[1024]; - va_list args; - va_start(args, fmt); - vsnprintf(msg, 1024, fmt, args); - va_end(args); - _THError(file, line, "Assertion `%s' failed. %s", exp, msg); -} - -void THSetErrorHandler(THErrorHandlerFunction new_handler, void *data) -{ - threadErrorHandler = new_handler; - threadErrorHandlerData = data; -} - -void THSetDefaultErrorHandler(THErrorHandlerFunction new_handler, void *data) -{ - if (new_handler) - defaultErrorHandler = new_handler; - else - defaultErrorHandler = defaultErrorHandlerFunction; - defaultErrorHandlerData = data; -} - -/* Torch Arg Checking Handling */ -static void defaultArgErrorHandlerFunction(int argNumber, const char *msg, void *data) -{ - if(msg) - printf("$ Invalid argument %d: %s\n", argNumber, msg); - else - printf("$ Invalid argument %d\n", argNumber); - exit(-1); -} - -static THArgErrorHandlerFunction defaultArgErrorHandler = defaultArgErrorHandlerFunction; -static void *defaultArgErrorHandlerData; -static __thread THArgErrorHandlerFunction threadArgErrorHandler = NULL; -static __thread void *threadArgErrorHandlerData; - -void _THArgCheck(const char *file, int line, int condition, int argNumber, const char *fmt, ...) -{ - if(!condition) { - char msg[2048]; - va_list args; - - /* vasprintf not standard */ - /* vsnprintf: how to handle if does not exists? 
*/ - va_start(args, fmt); - int n = vsnprintf(msg, 2048, fmt, args); - va_end(args); - - if(n < 2048) { - snprintf(msg + n, 2048 - n, " at %s:%d", file, line); - } - - if (threadArgErrorHandler) - (*threadArgErrorHandler)(argNumber, msg, threadArgErrorHandlerData); - else - (*defaultArgErrorHandler)(argNumber, msg, defaultArgErrorHandlerData); - } -} - -void THSetArgErrorHandler(THArgErrorHandlerFunction new_handler, void *data) -{ - threadArgErrorHandler = new_handler; - threadArgErrorHandlerData = data; -} - -void THSetDefaultArgErrorHandler(THArgErrorHandlerFunction new_handler, void *data) -{ - if (new_handler) - defaultArgErrorHandler = new_handler; - else - defaultArgErrorHandler = defaultArgErrorHandlerFunction; - defaultArgErrorHandlerData = data; -} - -static __thread void (*torchGCFunction)(void *data) = NULL; -static __thread void *torchGCData; -static ptrdiff_t heapSize = 0; -static __thread ptrdiff_t heapDelta = 0; -static const ptrdiff_t heapMaxDelta = (ptrdiff_t)1e6; // limit to +/- 1MB before updating heapSize -static const ptrdiff_t heapMinDelta = (ptrdiff_t)-1e6; -static __thread ptrdiff_t heapSoftmax = (ptrdiff_t)3e8; // 300MB, adjusted upward dynamically -static const double heapSoftmaxGrowthThresh = 0.8; // grow softmax if >80% max after GC -static const double heapSoftmaxGrowthFactor = 1.4; // grow softmax by 40% - -/* Optional hook for integrating with a garbage-collected frontend. - * - * If torch is running with a garbage-collected frontend (e.g. Lua), - * the GC isn't aware of TH-allocated memory so may not know when it - * needs to run. These hooks trigger the GC to run in two cases: - * - * (1) When a memory allocation (malloc, realloc, ...) fails - * (2) When the total TH-allocated memory hits a dynamically-adjusted - * soft maximum. 
- */ -void THSetGCHandler( void (*torchGCFunction_)(void *data), void *data ) -{ - torchGCFunction = torchGCFunction_; - torchGCData = data; -} - -/* it is guaranteed the allocated size is not bigger than PTRDIFF_MAX */ -static ptrdiff_t getAllocSize(void *ptr) { -#if defined(__unix) && defined(HAVE_MALLOC_USABLE_SIZE) - return malloc_usable_size(ptr); -#elif defined(__APPLE__) - return malloc_size(ptr); -#elif defined(_WIN32) - if(ptr) { return _msize(ptr); } else { return 0; } -#else - return 0; -#endif -} - -static ptrdiff_t applyHeapDelta() { - ptrdiff_t oldHeapSize = THAtomicAddPtrdiff(&heapSize, heapDelta); -#ifdef DEBUG - if (heapDelta > 0 && oldHeapSize > PTRDIFF_MAX - heapDelta) - THError("applyHeapDelta: heapSize(%td) + increased(%td) > PTRDIFF_MAX, heapSize overflow!", oldHeapSize, heapDelta); - if (heapDelta < 0 && oldHeapSize < PTRDIFF_MIN - heapDelta) - THError("applyHeapDelta: heapSize(%td) + decreased(%td) < PTRDIFF_MIN, heapSize underflow!", oldHeapSize, heapDelta); -#endif - ptrdiff_t newHeapSize = oldHeapSize + heapDelta; - heapDelta = 0; - return newHeapSize; -} - -/* (1) if the torch-allocated heap size exceeds the soft max, run GC - * (2) if post-GC heap size exceeds 80% of the soft max, increase the - * soft max by 40% - */ -static void maybeTriggerGC(ptrdiff_t curHeapSize) { - if (torchGCFunction && curHeapSize > heapSoftmax) { - torchGCFunction(torchGCData); - - // ensure heapSize is accurate before updating heapSoftmax - ptrdiff_t newHeapSize = applyHeapDelta(); - - if (newHeapSize > heapSoftmax * heapSoftmaxGrowthThresh) { - heapSoftmax = (ptrdiff_t)(heapSoftmax * heapSoftmaxGrowthFactor); - } - } -} - -// hooks into the TH heap tracking -void THHeapUpdate(ptrdiff_t size) { -#ifdef DEBUG - if (size > 0 && heapDelta > PTRDIFF_MAX - size) - THError("THHeapUpdate: heapDelta(%td) + increased(%td) > PTRDIFF_MAX, heapDelta overflow!", heapDelta, size); - if (size < 0 && heapDelta < PTRDIFF_MIN - size) - THError("THHeapUpdate: heapDelta(%td) + 
decreased(%td) < PTRDIFF_MIN, heapDelta underflow!", heapDelta, size); -#endif - - heapDelta += size; - - // batch updates to global heapSize to minimize thread contention - if (heapDelta < heapMaxDelta && heapDelta > heapMinDelta) { - return; - } - - ptrdiff_t newHeapSize = applyHeapDelta(); - - if (size > 0) { - maybeTriggerGC(newHeapSize); - } -} - -static void* THAllocInternal(ptrdiff_t size) -{ - void *ptr; - - if (size > 5120) - { -#if (defined(__unix) || defined(__APPLE__)) && (!defined(DISABLE_POSIX_MEMALIGN)) - if (posix_memalign(&ptr, 64, size) != 0) - ptr = NULL; -/* -#elif defined(_WIN32) - ptr = _aligned_malloc(size, 64); -*/ -#else - ptr = malloc(size); -#endif - } - else - { - ptr = malloc(size); - } - - THHeapUpdate(getAllocSize(ptr)); - return ptr; -} - -void* THAlloc(ptrdiff_t size) -{ - void *ptr; - - if(size < 0) - THError("$ Torch: invalid memory size -- maybe an overflow?"); - - if(size == 0) - return NULL; - - ptr = THAllocInternal(size); - - if(!ptr && torchGCFunction) { - torchGCFunction(torchGCData); - ptr = THAllocInternal(size); - } - - if(!ptr) - THError("$ Torch: not enough memory: you tried to allocate %dGB. Buy new RAM!", size/1073741824); - - return ptr; -} - -void* THRealloc(void *ptr, ptrdiff_t size) -{ - if(!ptr) - return(THAlloc(size)); - - if(size == 0) - { - THFree(ptr); - return NULL; - } - - if(size < 0) - THError("$ Torch: invalid memory size -- maybe an overflow?"); - - ptrdiff_t oldSize = -getAllocSize(ptr); - void *newptr = realloc(ptr, size); - - if(!newptr && torchGCFunction) { - torchGCFunction(torchGCData); - newptr = realloc(ptr, size); - } - - if(!newptr) - THError("$ Torch: not enough memory: you tried to reallocate %dGB. 
Buy new RAM!", size/1073741824); - - // update heapSize only after successfully reallocated - THHeapUpdate(oldSize + getAllocSize(newptr)); - - return newptr; -} - -void THFree(void *ptr) -{ - THHeapUpdate(-getAllocSize(ptr)); - free(ptr); -} - -double THLog1p(const double x) -{ -#if (defined(_MSC_VER) || defined(__MINGW32__)) - volatile double y = 1 + x; - return log(y) - ((y-1)-x)/y ; /* cancels errors with IEEE arithmetic */ -#else - return log1p(x); -#endif -} - -void THSetNumThreads(int num_threads) -{ -#ifdef _OPENMP - omp_set_num_threads(num_threads); -#endif -#ifdef TH_BLAS_OPEN - extern void openblas_set_num_threads(int); - openblas_set_num_threads(num_threads); -#endif -#ifdef TH_BLAS_MKL - extern void mkl_set_num_threads(int); - mkl_set_num_threads(num_threads); - -#endif -} - -int THGetNumThreads(void) -{ - int nthreads = 1; -#ifdef _OPENMP - nthreads = omp_get_max_threads(); -#endif -#ifdef TH_BLAS_OPEN - int bl_threads = 1; - extern int openblas_get_num_threads(void); - bl_threads = openblas_get_num_threads(); - nthreads = nthreads > bl_threads ? bl_threads : nthreads; -#endif -#ifdef TH_BLAS_MKL - int bl_threads = 1; - extern int mkl_get_max_threads(void); - bl_threads = mkl_get_max_threads(); - nthreads = nthreads > bl_threads ? bl_threads : nthreads; -#endif - return nthreads; -} - -int THGetNumCores(void) -{ -#ifdef _OPENMP - return omp_get_num_procs(); -#else - return 1; -#endif -} - -#ifdef TH_BLAS_MKL -extern int mkl_get_max_threads(void); -#endif - -TH_API void THInferNumThreads(void) -{ -#if defined(_OPENMP) && defined(TH_BLAS_MKL) - // If we are using MKL an OpenMP make sure the number of threads match. 
- // Otherwise, MKL and our OpenMP-enabled functions will keep changing the - // size of the OpenMP thread pool, resulting in worse performance (and memory - // leaks in GCC 5.4) - omp_set_num_threads(mkl_get_max_threads()); -#endif -} - -TH_API THDescBuff _THSizeDesc(const long *size, const long ndim) { - const int L = TH_DESC_BUFF_LEN; - THDescBuff buf; - char *str = buf.str; - int n = 0; - n += snprintf(str, L-n, "["); - int i; - for(i = 0; i < ndim; i++) { - if(n >= L) break; - n += snprintf(str+n, L-n, "%ld", size[i]); - if(i < ndim-1) { - n += snprintf(str+n, L-n, " x "); - } - } - if(n < L - 2) { - snprintf(str+n, L-n, "]"); - } else { - snprintf(str+L-5, 5, "...]"); - } - return buf; -} - diff --git a/contrib/lua-torch/torch7/lib/TH/THGeneral.h.in b/contrib/lua-torch/torch7/lib/TH/THGeneral.h.in deleted file mode 100644 index 88a3934c8..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THGeneral.h.in +++ /dev/null @@ -1,130 +0,0 @@ -#ifndef TH_GENERAL_INC -#define TH_GENERAL_INC - -#include <stdlib.h> -#include <stdio.h> -#include <stdarg.h> -#include <math.h> -#include <limits.h> -#include <float.h> -#include <time.h> -#include <string.h> -#include <stddef.h> - -#cmakedefine USE_BLAS -#cmakedefine USE_LAPACK -#cmakedefine BLAS_F2C - -#ifdef __cplusplus -# define TH_EXTERNC extern "C" -#else -# define TH_EXTERNC extern -#endif - -#ifdef _WIN32 -# ifdef TH_EXPORTS -# define TH_API TH_EXTERNC __declspec(dllexport) -# else -# define TH_API TH_EXTERNC __declspec(dllimport) -# endif -#else -# define TH_API TH_EXTERNC -#endif - -#ifndef M_PI -# define M_PI 3.14159265358979323846 -#endif - -#ifndef TH_INDEX_BASE -#define TH_INDEX_BASE 1 -#endif - -typedef void (*THErrorHandlerFunction)(const char *msg, void *data); -typedef void (*THArgErrorHandlerFunction)(int argNumber, const char *msg, void *data); - -#define TH_DESC_BUFF_LEN 64 -typedef struct { - char str[TH_DESC_BUFF_LEN]; -} THDescBuff; - - -TH_API double THLog1p(const double x); -TH_API THDescBuff 
_THSizeDesc(const long *size, const long ndim); -TH_API void _THError(const char *file, const int line, const char *fmt, ...); -TH_API void _THAssertionFailed(const char *file, const int line, const char *exp, const char *fmt, ...); -TH_API void THSetErrorHandler(THErrorHandlerFunction new_handler, void *data); -TH_API void THSetDefaultErrorHandler(THErrorHandlerFunction new_handler, void *data); -TH_API void _THArgCheck(const char *file, int line, int condition, int argNumber, const char *fmt, ...); -TH_API void THSetArgErrorHandler(THArgErrorHandlerFunction new_handler, void *data); -TH_API void THSetDefaultArgErrorHandler(THArgErrorHandlerFunction new_handler, void *data); -TH_API void* THAlloc(ptrdiff_t size); -TH_API void* THRealloc(void *ptr, ptrdiff_t size); -TH_API void THFree(void *ptr); -TH_API void THSetGCHandler( void (*torchGCHandlerFunction)(void *data), void *data ); -// this hook should only be called by custom allocator functions -TH_API void THHeapUpdate(ptrdiff_t size); -TH_API void THSetNumThreads(int num_threads); -TH_API int THGetNumThreads(void); -TH_API int THGetNumCores(void); -TH_API void THInferNumThreads(void); - -#define THError(...) _THError(__FILE__, __LINE__, __VA_ARGS__) - -#define THCleanup(...) __VA_ARGS__ - -#define THArgCheck(...) \ -do { \ - _THArgCheck(__FILE__, __LINE__, __VA_ARGS__); \ -} while(0) - -#define THArgCheckWithCleanup(condition, cleanup, ...) \ -do if (!(condition)) { \ - cleanup \ - _THArgCheck(__FILE__, __LINE__, 0, __VA_ARGS__); \ -} while(0) - -#define THAssert(exp) \ -do { \ - if (!(exp)) { \ - _THAssertionFailed(__FILE__, __LINE__, #exp, ""); \ - } \ -} while(0) - -#define THAssertMsg(exp, ...) 
\ -do { \ - if (!(exp)) { \ - _THAssertionFailed(__FILE__, __LINE__, #exp, __VA_ARGS__); \ - } \ -} while(0) - -#define TH_CONCAT_STRING_2(x,y) TH_CONCAT_STRING_2_EXPAND(x,y) -#define TH_CONCAT_STRING_2_EXPAND(x,y) #x #y - -#define TH_CONCAT_STRING_3(x,y,z) TH_CONCAT_STRING_3_EXPAND(x,y,z) -#define TH_CONCAT_STRING_3_EXPAND(x,y,z) #x #y #z - -#define TH_CONCAT_STRING_4(x,y,z,w) TH_CONCAT_STRING_4_EXPAND(x,y,z,w) -#define TH_CONCAT_STRING_4_EXPAND(x,y,z,w) #x #y #z #w - -#define TH_CONCAT_2(x,y) TH_CONCAT_2_EXPAND(x,y) -#define TH_CONCAT_2_EXPAND(x,y) x ## y - -#define TH_CONCAT_3(x,y,z) TH_CONCAT_3_EXPAND(x,y,z) -#define TH_CONCAT_3_EXPAND(x,y,z) x ## y ## z - -#define TH_CONCAT_4_EXPAND(x,y,z,w) x ## y ## z ## w -#define TH_CONCAT_4(x,y,z,w) TH_CONCAT_4_EXPAND(x,y,z,w) - -#define THMin(X, Y) ((X) < (Y) ? (X) : (Y)) -#define THMax(X, Y) ((X) > (Y) ? (X) : (Y)) - -#if (defined(_MSC_VER) || defined(__MINGW32__)) -# define log1p(x) THLog1p(x) -#define snprintf _snprintf -#define popen _popen -#define pclose _pclose -#include <BaseTsd.h> -typedef SSIZE_T ssize_t; -#endif - -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/THGenerateAllTypes.h b/contrib/lua-torch/torch7/lib/TH/THGenerateAllTypes.h deleted file mode 100644 index 5b9508df7..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THGenerateAllTypes.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef TH_GENERIC_FILE -#error "You must define TH_GENERIC_FILE before including THGenerateAllTypes.h" -#endif - -#ifndef THGenerateManyTypes -#define THAllLocalGenerateManyTypes -#define THGenerateManyTypes -#endif - -#include "THGenerateFloatTypes.h" -#include "THGenerateIntTypes.h" - -#ifdef THAllLocalGenerateManyTypes -#undef THAllLocalGenerateManyTypes -#undef THGenerateManyTypes -#undef TH_GENERIC_FILE -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/THGenerateByteType.h b/contrib/lua-torch/torch7/lib/TH/THGenerateByteType.h deleted file mode 100644 index 71ce7c405..000000000 --- 
a/contrib/lua-torch/torch7/lib/TH/THGenerateByteType.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef TH_GENERIC_FILE -#error "You must define TH_GENERIC_FILE before including THGenerateByteType.h" -#endif - -#define real unsigned char -#define accreal long -#define Real Byte -#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val) -#define TH_CONVERT_ACCREAL_TO_REAL(_val) (real)(_val) -#define THInf UCHAR_MAX -#define TH_REAL_IS_BYTE -#line 1 TH_GENERIC_FILE -#include TH_GENERIC_FILE -#undef real -#undef accreal -#undef Real -#undef THInf -#undef TH_REAL_IS_BYTE -#undef TH_CONVERT_REAL_TO_ACCREAL -#undef TH_CONVERT_ACCREAL_TO_REAL - -#ifndef THGenerateManyTypes -#undef TH_GENERIC_FILE -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/THGenerateCharType.h b/contrib/lua-torch/torch7/lib/TH/THGenerateCharType.h deleted file mode 100644 index 158dd0e80..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THGenerateCharType.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef TH_GENERIC_FILE -#error "You must define TH_GENERIC_FILE before including THGenerateCharType.h" -#endif - -#define real char -#define accreal long -#define Real Char -#define THInf CHAR_MAX -#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val) -#define TH_CONVERT_ACCREAL_TO_REAL(_val) (real)(_val) -#define TH_REAL_IS_CHAR -#line 1 TH_GENERIC_FILE -#include TH_GENERIC_FILE -#undef real -#undef accreal -#undef Real -#undef THInf -#undef TH_REAL_IS_CHAR -#undef TH_CONVERT_REAL_TO_ACCREAL -#undef TH_CONVERT_ACCREAL_TO_REAL - -#ifndef THGenerateManyTypes -#undef TH_GENERIC_FILE -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/THGenerateDoubleType.h b/contrib/lua-torch/torch7/lib/TH/THGenerateDoubleType.h deleted file mode 100644 index fffee606d..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THGenerateDoubleType.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef TH_GENERIC_FILE -#error "You must define TH_GENERIC_FILE before including THGenerateDoubleType.h" -#endif - -#define real double -#define accreal double 
-#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val) -#define TH_CONVERT_ACCREAL_TO_REAL(_val) (real)(_val) -#define Real Double -#define THInf DBL_MAX -#define TH_REAL_IS_DOUBLE -#line 1 TH_GENERIC_FILE -#include TH_GENERIC_FILE -#undef accreal -#undef real -#undef Real -#undef THInf -#undef TH_REAL_IS_DOUBLE -#undef TH_CONVERT_REAL_TO_ACCREAL -#undef TH_CONVERT_ACCREAL_TO_REAL - -#ifndef THGenerateManyTypes -#undef TH_GENERIC_FILE -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/THGenerateFloatType.h b/contrib/lua-torch/torch7/lib/TH/THGenerateFloatType.h deleted file mode 100644 index a31b50c55..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THGenerateFloatType.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef TH_GENERIC_FILE -#error "You must define TH_GENERIC_FILE before including THGenerateFloatType.h" -#endif - -#define real float -#define accreal double -#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val) -#define TH_CONVERT_ACCREAL_TO_REAL(_val) (real)(_val) -#define Real Float -#define THInf FLT_MAX -#define TH_REAL_IS_FLOAT -#line 1 TH_GENERIC_FILE -#include TH_GENERIC_FILE -#undef accreal -#undef real -#undef Real -#undef THInf -#undef TH_REAL_IS_FLOAT -#undef TH_CONVERT_REAL_TO_ACCREAL -#undef TH_CONVERT_ACCREAL_TO_REAL - -#ifndef THGenerateManyTypes -#undef TH_GENERIC_FILE -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/THGenerateFloatTypes.h b/contrib/lua-torch/torch7/lib/TH/THGenerateFloatTypes.h deleted file mode 100644 index be5ea8403..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THGenerateFloatTypes.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef TH_GENERIC_FILE -#error "You must define TH_GENERIC_FILE before including THGenerateFloatTypes.h" -#endif - -#ifndef THGenerateManyTypes -#define THFloatLocalGenerateManyTypes -#define THGenerateManyTypes -#endif - -#include "THGenerateFloatType.h" -#include "THGenerateDoubleType.h" - -#ifdef THFloatLocalGenerateManyTypes -#undef THFloatLocalGenerateManyTypes -#undef THGenerateManyTypes -#undef 
TH_GENERIC_FILE -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/THGenerateHalfType.h b/contrib/lua-torch/torch7/lib/TH/THGenerateHalfType.h deleted file mode 100644 index 47ff1e8d7..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THGenerateHalfType.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef TH_GENERIC_FILE -#error "You must define TH_GENERIC_FILE before including THGenerateHalfType.h" -#endif - -#include "THHalf.h" -#define real THHalf -#define accreal float -#define TH_CONVERT_REAL_TO_ACCREAL(_val) TH_half2float(_val) -#define TH_CONVERT_ACCREAL_TO_REAL(_val) TH_float2half(_val) -#define Real Half -#define THInf TH_HALF_BITS_TO_LITERAL(TH_HALF_INF) -#define TH_REAL_IS_HALF -#line 1 TH_GENERIC_FILE -#include TH_GENERIC_FILE -#undef real -#undef accreal -#undef Real -#undef THInf -#undef TH_REAL_IS_HALF -#undef TH_CONVERT_REAL_TO_ACCREAL -#undef TH_CONVERT_ACCREAL_TO_REAL - -#ifndef THGenerateManyTypes -#undef TH_GENERIC_FILE -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/THGenerateIntType.h b/contrib/lua-torch/torch7/lib/TH/THGenerateIntType.h deleted file mode 100644 index 1562b9e98..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THGenerateIntType.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef TH_GENERIC_FILE -#error "You must define TH_GENERIC_FILE before including THGenerateIntType.h" -#endif - -#define real int -#define accreal long -#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val) -#define TH_CONVERT_ACCREAL_TO_REAL(_val) (real)(_val) -#define Real Int -#define THInf INT_MAX -#define TH_REAL_IS_INT -#line 1 TH_GENERIC_FILE -#include TH_GENERIC_FILE -#undef real -#undef accreal -#undef Real -#undef THInf -#undef TH_REAL_IS_INT -#undef TH_CONVERT_REAL_TO_ACCREAL -#undef TH_CONVERT_ACCREAL_TO_REAL - -#ifndef THGenerateManyTypes -#undef TH_GENERIC_FILE -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/THGenerateIntTypes.h b/contrib/lua-torch/torch7/lib/TH/THGenerateIntTypes.h deleted file mode 100644 index 9931fb1f5..000000000 --- 
a/contrib/lua-torch/torch7/lib/TH/THGenerateIntTypes.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef TH_GENERIC_FILE -#error "You must define TH_GENERIC_FILE before including THGenerateIntTypes.h" -#endif - -#ifndef THGenerateManyTypes -#define THIntLocalGenerateManyTypes -#define THGenerateManyTypes -#endif - -#include "THGenerateByteType.h" -#include "THGenerateCharType.h" -#include "THGenerateShortType.h" -#include "THGenerateIntType.h" -#include "THGenerateLongType.h" - -#ifdef THIntLocalGenerateManyTypes -#undef THIntLocalGenerateManyTypes -#undef THGenerateManyTypes -#undef TH_GENERIC_FILE -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/THGenerateLongType.h b/contrib/lua-torch/torch7/lib/TH/THGenerateLongType.h deleted file mode 100644 index 75f90e1a6..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THGenerateLongType.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef TH_GENERIC_FILE -#error "You must define TH_GENERIC_FILE before including THGenerateLongType.h" -#endif - -#define real long -#define accreal long -#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val) -#define TH_CONVERT_ACCREAL_TO_REAL(_val) (real)(_val) -#define Real Long -#define THInf LONG_MAX -#define TH_REAL_IS_LONG -#line 1 TH_GENERIC_FILE -#include TH_GENERIC_FILE -#undef real -#undef accreal -#undef Real -#undef THInf -#undef TH_REAL_IS_LONG -#undef TH_CONVERT_REAL_TO_ACCREAL -#undef TH_CONVERT_ACCREAL_TO_REAL - -#ifndef THGenerateManyTypes -#undef TH_GENERIC_FILE -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/THGenerateShortType.h b/contrib/lua-torch/torch7/lib/TH/THGenerateShortType.h deleted file mode 100644 index 047e51a8d..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THGenerateShortType.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef TH_GENERIC_FILE -#error "You must define TH_GENERIC_FILE before including THGenerateShortType.h" -#endif - -#define real short -#define accreal long -#define TH_CONVERT_REAL_TO_ACCREAL(_val) (accreal)(_val) -#define TH_CONVERT_ACCREAL_TO_REAL(_val) 
(real)(_val) -#define Real Short -#define THInf SHRT_MAX -#define TH_REAL_IS_SHORT -#line 1 TH_GENERIC_FILE -#include TH_GENERIC_FILE -#undef real -#undef accreal -#undef Real -#undef THInf -#undef TH_REAL_IS_SHORT -#undef TH_CONVERT_REAL_TO_ACCREAL -#undef TH_CONVERT_ACCREAL_TO_REAL - -#ifndef THGenerateManyTypes -#undef TH_GENERIC_FILE -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/THHalf.c b/contrib/lua-torch/torch7/lib/TH/THHalf.c deleted file mode 100644 index d7468ac3d..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THHalf.c +++ /dev/null @@ -1,100 +0,0 @@ -#include "THHalf.h" - -/* Copyright 1993-2014 NVIDIA Corporation. All rights reserved. */ - -THHalf TH_float2half(float f) -{ - THHalf h; - TH_float2halfbits(&f, &h.x); - return h; -} - -TH_API float TH_half2float(THHalf h) -{ - float f; - TH_halfbits2float(&h.x, &f); - return f; -} - -// Host functions for converting between FP32 and FP16 formats - -void TH_halfbits2float(unsigned short* src, float* res) -{ - unsigned h = *src; - unsigned sign = ((h >> 15) & 1); - unsigned exponent = ((h >> 10) & 0x1f); - unsigned mantissa = ((h & 0x3ff) << 13); - - if (exponent == 0x1f) { /* NaN or Inf */ - mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0); - exponent = 0xff; - } else if (!exponent) { /* Denorm or Zero */ - if (mantissa) { - unsigned int msb; - exponent = 0x71; - do { - msb = (mantissa & 0x400000); - mantissa <<= 1; /* normalize */ - --exponent; - } while (!msb); - mantissa &= 0x7fffff; /* 1.mantissa is implicit */ - } - } else { - exponent += 0x70; - } - - *(unsigned*)res = ((sign << 31) | (exponent << 23) | mantissa); -} - -void TH_float2halfbits(float* src, unsigned short* dest) -{ - unsigned x = *(unsigned*)src; - unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1; - unsigned sign, exponent, mantissa; - - // Get rid of +NaN/-NaN case first. - if (u > 0x7f800000) { - *dest = 0x7fffU; - return ; - } - - sign = ((x >> 16) & 0x8000); - - // Get rid of +Inf/-Inf, +0/-0. 
- if (u > 0x477fefff) { - *dest = sign | 0x7c00U; - return; - } - if (u < 0x33000001) { - *dest = (sign | 0x0000); - return; - } - - exponent = ((u >> 23) & 0xff); - mantissa = (u & 0x7fffff); - - if (exponent > 0x70) { - shift = 13; - exponent -= 0x70; - } else { - shift = 0x7e - exponent; - exponent = 0; - mantissa |= 0x800000; - } - lsb = (1 << shift); - lsb_s1 = (lsb >> 1); - lsb_m1 = (lsb - 1); - - // Round to nearest even. - remainder = (mantissa & lsb_m1); - mantissa >>= shift; - if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) { - ++mantissa; - if (!(mantissa & 0x3ff)) { - ++exponent; - mantissa = 0; - } - } - - *dest = (sign | (exponent << 10) | mantissa); -} diff --git a/contrib/lua-torch/torch7/lib/TH/THHalf.h b/contrib/lua-torch/torch7/lib/TH/THHalf.h deleted file mode 100644 index 0f9807b50..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THHalf.h +++ /dev/null @@ -1,41 +0,0 @@ -#ifndef TH_HALF_H -#define TH_HALF_H - -#include "THGeneral.h" -#include <stdint.h> - -/* Neither built-in nor included from Cutorch, use our definition lifted from CUDA */ -#if defined(__GNUC__) -#define __thalign__(n) __attribute__((aligned(n))) -#elif defined(_WIN32) -#define __thalign__(n) __declspec(align(n)) -#else -#define __thalign__(n) -#endif - -typedef struct __thalign__(2){ - unsigned short x; -} __THHalf; - -typedef struct __thalign__(4) { - unsigned int x; -} __THHalf2; - -typedef __THHalf THHalf; -typedef __THHalf2 THHalf2; - -TH_API void TH_float2halfbits(float*, unsigned short*); -TH_API void TH_halfbits2float(unsigned short*, float*); - -TH_API THHalf TH_float2half(float); -TH_API float TH_half2float(THHalf); - -#ifndef TH_HALF_BITS_TO_LITERAL -# define TH_HALF_BITS_TO_LITERAL(n) { n } -#endif - -#define TH_HALF_ZERO 0x0U -#define TH_HALF_INF 0x7C00U - -#undef __thalign__ -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/THLapack.c b/contrib/lua-torch/torch7/lib/TH/THLapack.c deleted file mode 100644 index bd4dc716b..000000000 --- 
a/contrib/lua-torch/torch7/lib/TH/THLapack.c +++ /dev/null @@ -1,4 +0,0 @@ -#include "THLapack.h" - -#include "generic/THLapack.c" -#include "THGenerateFloatTypes.h" diff --git a/contrib/lua-torch/torch7/lib/TH/THLapack.h b/contrib/lua-torch/torch7/lib/TH/THLapack.h deleted file mode 100644 index 614d15f94..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THLapack.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef TH_LAPACK_INC -#define TH_LAPACK_INC - -#include "THGeneral.h" - -#define THLapack_(NAME) TH_CONCAT_4(TH,Real,Lapack_,NAME) - -#define THLapackCheck(fmt, func, info , ...) \ -if (info < 0) { \ - THError("Lapack Error in %s : Illegal Argument %d", func, -info); \ -} else if(info > 0) { \ - THError(fmt, func, info, ##__VA_ARGS__); \ -} \ - -#define THLapackCheckWithCleanup(fmt, cleanup, func, info , ...) \ -if (info < 0) { \ - cleanup \ - THError("Lapack Error in %s : Illegal Argument %d", func, -info); \ -} else if(info > 0) { \ - cleanup \ - THError(fmt, func, info, ##__VA_ARGS__); \ -} - -#include "generic/THLapack.h" -#include "THGenerateAllTypes.h" - -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/THLogAdd.c b/contrib/lua-torch/torch7/lib/TH/THLogAdd.c deleted file mode 100644 index 4b14f8540..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THLogAdd.c +++ /dev/null @@ -1,88 +0,0 @@ -#include "THLogAdd.h" - -#include <float.h> - -#ifdef USE_DOUBLE -#define MINUS_LOG_THRESHOLD -39.14 -#else -#define MINUS_LOG_THRESHOLD -18.42 -#endif - -const double THLog2Pi=1.83787706640934548355; -const double THLogZero=-DBL_MAX; -const double THLogOne=0; - -double THLogAdd(double log_a, double log_b) -{ - double minusdif; - - if (log_a < log_b) - { - double tmp = log_a; - log_a = log_b; - log_b = tmp; - } - - minusdif = log_b - log_a; -#ifdef DEBUG - if (isnan(minusdif)) - THError("THLogAdd: minusdif (%f) log_b (%f) or log_a (%f) is nan", minusdif, log_b, log_a); -#endif - if (minusdif < MINUS_LOG_THRESHOLD) - return log_a; - else - return log_a + log1p(exp(minusdif)); -} 
- -double THLogSub(double log_a, double log_b) -{ - double minusdif; - - if (log_a < log_b) - THError("LogSub: log_a (%f) should be greater than log_b (%f)", log_a, log_b); - - minusdif = log_b - log_a; -#ifdef DEBUG - if (isnan(minusdif)) - THError("LogSub: minusdif (%f) log_b (%f) or log_a (%f) is nan", minusdif, log_b, log_a); -#endif - if (log_a == log_b) - return THLogZero; - else if (minusdif < MINUS_LOG_THRESHOLD) - return log_a; - else - return log_a + log1p(-exp(minusdif)); -} - -/* Credits to Leon Bottou */ -double THExpMinusApprox(const double x) -{ -#define EXACT_EXPONENTIAL 0 -#if EXACT_EXPONENTIAL - return exp(-x); -#else - /* fast approximation of exp(-x) for x positive */ -# define A0 (1.0) -# define A1 (0.125) -# define A2 (0.0078125) -# define A3 (0.00032552083) -# define A4 (1.0172526e-5) - if (x < 13.0) - { -/* assert(x>=0); */ - double y; - y = A0+x*(A1+x*(A2+x*(A3+x*A4))); - y *= y; - y *= y; - y *= y; - y = 1/y; - return y; - } - return 0; -# undef A0 -# undef A1 -# undef A2 -# undef A3 -# undef A4 -#endif -} diff --git a/contrib/lua-torch/torch7/lib/TH/THLogAdd.h b/contrib/lua-torch/torch7/lib/TH/THLogAdd.h deleted file mode 100644 index 9319b8f46..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THLogAdd.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef TH_LOG_ADD_INC -#define TH_LOG_ADD_INC - -#include "THGeneral.h" - -TH_API const double THLog2Pi; -TH_API const double THLogZero; -TH_API const double THLogOne; - -TH_API double THLogAdd(double log_a, double log_b); -TH_API double THLogSub(double log_a, double log_b); -TH_API double THExpMinusApprox(const double x); - -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/THMath.h b/contrib/lua-torch/torch7/lib/TH/THMath.h deleted file mode 100644 index 004e4fe45..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THMath.h +++ /dev/null @@ -1,36 +0,0 @@ -#ifndef _THMATH_H -#define _THMATH_H - -static inline double TH_sigmoid(double value) { - return 1.0 / (1.0 + exp(-value)); -} - -static inline double 
TH_frac(double x) { - return x - trunc(x); -} - -static inline double TH_rsqrt(double x) { - return 1.0 / sqrt(x); -} - -static inline double TH_lerp(double a, double b, double weight) { - return a + weight * (b-a); -} - -static inline float TH_sigmoidf(float value) { - return 1.0f / (1.0f + expf(-value)); -} - -static inline float TH_fracf(float x) { - return x - truncf(x); -} - -static inline float TH_rsqrtf(float x) { - return 1.0f / sqrtf(x); -} - -static inline float TH_lerpf(float a, float b, float weight) { - return a + weight * (b-a); -} - -#endif // _THMATH_H diff --git a/contrib/lua-torch/torch7/lib/TH/THMemoryFile.c b/contrib/lua-torch/torch7/lib/TH/THMemoryFile.c deleted file mode 100644 index ecce6e1b1..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THMemoryFile.c +++ /dev/null @@ -1,685 +0,0 @@ -#include "THMemoryFile.h" -#include "THFilePrivate.h" -#include "stdint.h" - -typedef struct THMemoryFile__ -{ - THFile file; - THCharStorage *storage; - size_t size; - size_t position; - int longSize; - -} THMemoryFile; - -static int THMemoryFile_isOpened(THFile *self) -{ - THMemoryFile *mfself = (THMemoryFile*)self; - return (mfself->storage != NULL); -} - -static char *THMemoryFile_strnextspace(char *str_, char *c_) -{ - char c; - - while( (c = *str_) ) - { - if( (c != ' ') && (c != '\n') && (c != ':') && (c != ';') ) - break; - str_++; - } - - while( (c = *str_) ) - { - if( (c == ' ') || (c == '\n') || (c == ':') || (c == ';') ) - { - *c_ = c; - *str_ = '\0'; - return(str_); - } - str_++; - } - return NULL; -} - -static void THMemoryFile_grow(THMemoryFile *self, size_t size) -{ - size_t missingSpace; - - if(size <= self->size) - return; - else - { - if(size < self->storage->size) /* note the "<" and not "<=" */ - { - self->size = size; - self->storage->data[self->size] = '\0'; - return; - } - } - - missingSpace = size-self->storage->size+1; /* +1 for the '\0' */ - THCharStorage_resize(self->storage, (self->storage->size/2 > missingSpace ? 
- self->storage->size + (self->storage->size/2) - : self->storage->size + missingSpace)); -} - -static int THMemoryFile_mode(const char *mode, int *isReadable, int *isWritable) -{ - *isReadable = 0; - *isWritable = 0; - if(strlen(mode) == 1) - { - if(*mode == 'r') - { - *isReadable = 1; - return 1; - } - else if(*mode == 'w') - { - *isWritable = 1; - return 1; - } - } - else if(strlen(mode) == 2) - { - if(mode[0] == 'r' && mode[1] == 'w') - { - *isReadable = 1; - *isWritable = 1; - return 1; - } - } - return 0; -} - -/********************************************************/ - -#define READ_WRITE_METHODS(TYPE, TYPEC, ASCII_READ_ELEM, ASCII_WRITE_ELEM, INSIDE_SPACING) \ - static size_t THMemoryFile_read##TYPEC(THFile *self, TYPE *data, size_t n) \ - { \ - THMemoryFile *mfself = (THMemoryFile*)self; \ - size_t nread = 0; \ - \ - THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file"); \ - THArgCheck(mfself->file.isReadable, 1, "attempt to read in a write-only file"); \ - \ - if (n == 0) \ - return 0; \ - \ - if(mfself->file.isBinary) \ - { \ - size_t nByte = sizeof(TYPE)*n; \ - size_t nByteRemaining = (mfself->position + nByte <= mfself->size ? 
nByte : mfself->size-mfself->position); \ - nread = nByteRemaining/sizeof(TYPE); \ - memmove(data, mfself->storage->data+mfself->position, nread*sizeof(TYPE)); \ - mfself->position += nread*sizeof(TYPE); \ - } \ - else \ - { \ - size_t i; \ - for(i = 0; i < n; i++) \ - { \ - size_t nByteRead = 0; \ - char spaceChar = 0; \ - char *spacePtr = THMemoryFile_strnextspace(mfself->storage->data+mfself->position, &spaceChar); \ - ASCII_READ_ELEM; \ - if(ret == EOF) \ - { \ - while(mfself->storage->data[mfself->position]) \ - mfself->position++; \ - } \ - else \ - mfself->position += nByteRead; \ - if(spacePtr) \ - *spacePtr = spaceChar; \ - } \ - if(mfself->file.isAutoSpacing && (n > 0)) \ - { \ - if( (mfself->position < mfself->size) && (mfself->storage->data[mfself->position] == '\n') ) \ - mfself->position++; \ - } \ - } \ - \ - if(nread != n) \ - { \ - mfself->file.hasError = 1; /* shouldn't we put hasError to 0 all the time ? */ \ - if(!mfself->file.isQuiet) \ - THError("read error: read %d blocks instead of %d", nread, n); \ - } \ - \ - return nread; \ - } \ - \ - static size_t THMemoryFile_write##TYPEC(THFile *self, TYPE *data, size_t n) \ - { \ - THMemoryFile *mfself = (THMemoryFile*)self; \ - \ - THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file"); \ - THArgCheck(mfself->file.isWritable, 1, "attempt to write in a read-only file"); \ - \ - if (n == 0) \ - return 0; \ - \ - if(mfself->file.isBinary) \ - { \ - size_t nByte = sizeof(TYPE)*n; \ - THMemoryFile_grow(mfself, mfself->position+nByte); \ - memmove(mfself->storage->data+mfself->position, data, nByte); \ - mfself->position += nByte; \ - if(mfself->position > mfself->size) \ - { \ - mfself->size = mfself->position; \ - mfself->storage->data[mfself->size] = '\0'; \ - } \ - } \ - else \ - { \ - size_t i; \ - for(i = 0; i < n; i++) \ - { \ - ssize_t nByteWritten; \ - while (1) \ - { \ - ASCII_WRITE_ELEM; \ - if( (nByteWritten > -1) && (nByteWritten < mfself->storage->size-mfself->position) ) \ - 
{ \ - mfself->position += nByteWritten; \ - break; \ - } \ - THMemoryFile_grow(mfself, mfself->storage->size + (mfself->storage->size/2) + 2); \ - } \ - if(mfself->file.isAutoSpacing) \ - { \ - if(i < n-1) \ - { \ - THMemoryFile_grow(mfself, mfself->position+1); \ - sprintf(mfself->storage->data+mfself->position, " "); \ - mfself->position++; \ - } \ - if(i == n-1) \ - { \ - THMemoryFile_grow(mfself, mfself->position+1); \ - sprintf(mfself->storage->data+mfself->position, "\n"); \ - mfself->position++; \ - } \ - } \ - } \ - if(mfself->position > mfself->size) \ - { \ - mfself->size = mfself->position; \ - mfself->storage->data[mfself->size] = '\0'; \ - } \ - } \ - \ - return n; \ - } - - -void THMemoryFile_longSize(THFile *self, int size) -{ - THMemoryFile *dfself = (THMemoryFile*)(self); - THArgCheck(size == 0 || size == 4 || size == 8, 1, "Invalid long size specified"); - dfself->longSize = size; -} - -THCharStorage *THMemoryFile_storage(THFile *self) -{ - THMemoryFile *mfself = (THMemoryFile*)self; - THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file"); - - THCharStorage_resize(mfself->storage, mfself->size+1); - - return mfself->storage; -} - -static void THMemoryFile_synchronize(THFile *self) -{ - THMemoryFile *mfself = (THMemoryFile*)self; - THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file"); -} - -static void THMemoryFile_seek(THFile *self, size_t position) -{ - THMemoryFile *mfself = (THMemoryFile*)self; - - THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file"); - THArgCheck(position >= 0, 2, "position must be positive"); - - if(position <= mfself->size) - mfself->position = position; - else - { - mfself->file.hasError = 1; - if(!mfself->file.isQuiet) - THError("unable to seek at position %zu", position); - } -} - -static void THMemoryFile_seekEnd(THFile *self) -{ - THMemoryFile *mfself = (THMemoryFile*)self; - THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file"); - - mfself->position 
= mfself->size; -} - -static size_t THMemoryFile_position(THFile *self) -{ - THMemoryFile *mfself = (THMemoryFile*)self; - THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file"); - return mfself->position; -} - -static void THMemoryFile_close(THFile *self) -{ - THMemoryFile *mfself = (THMemoryFile*)self; - THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file"); - THCharStorage_free(mfself->storage); - mfself->storage = NULL; -} - -static void THMemoryFile_free(THFile *self) -{ - THMemoryFile *mfself = (THMemoryFile*)self; - - if(mfself->storage) - THCharStorage_free(mfself->storage); - - THFree(mfself); -} - -/* READ_WRITE_METHODS(bool, Bool, */ -/* int value = 0; int ret = sscanf(mfself->storage->data+mfself->position, "%d%n", &value, &nByteRead); data[i] = (value ? 1 : 0), */ -/* int value = (data[i] ? 1 : 0); nByteWritten = snprintf(mfself->storage->data+mfself->position, mfself->storage->size-mfself->position, "%d", value), */ -/* 1) */ - -READ_WRITE_METHODS(unsigned char, Byte, - size_t ret = (mfself->position + n <= mfself->size ? n : mfself->size-mfself->position); \ - if(spacePtr) *spacePtr = spaceChar; \ - nByteRead = ret; \ - nread = ret; \ - i = n-1; \ - memmove(data, mfself->storage->data+mfself->position, nByteRead), - nByteWritten = (n < mfself->storage->size-mfself->position ? n : -1); \ - i = n-1; \ - if(nByteWritten > -1) - memmove(mfself->storage->data+mfself->position, data, nByteWritten), - 0) - -/* DEBUG: we should check if %n is count or not as a element (so ret might need to be ret-- on some systems) */ -/* Note that we do a trick for char */ -READ_WRITE_METHODS(char, Char, - size_t ret = (mfself->position + n <= mfself->size ? n : mfself->size-mfself->position); \ - if(spacePtr) *spacePtr = spaceChar; \ - nByteRead = ret; \ - nread = ret; \ - i = n-1; \ - memmove(data, mfself->storage->data+mfself->position, nByteRead), - nByteWritten = (n < mfself->storage->size-mfself->position ? 
n : -1); \ - i = n-1; \ - if(nByteWritten > -1) - memmove(mfself->storage->data+mfself->position, data, nByteWritten), - 0) - -READ_WRITE_METHODS(short, Short, - int nByteRead_; int ret = sscanf(mfself->storage->data+mfself->position, "%hd%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, - nByteWritten = snprintf(mfself->storage->data+mfself->position, mfself->storage->size-mfself->position, "%hd", data[i]), - 1) - -READ_WRITE_METHODS(int, Int, - int nByteRead_; int ret = sscanf(mfself->storage->data+mfself->position, "%d%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, - nByteWritten = snprintf(mfself->storage->data+mfself->position, mfself->storage->size-mfself->position, "%d", data[i]), - 1) - -READ_WRITE_METHODS(float, Float, - int nByteRead_; int ret = sscanf(mfself->storage->data+mfself->position, "%g%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, - nByteWritten = snprintf(mfself->storage->data+mfself->position, mfself->storage->size-mfself->position, "%.9g", data[i]), - 1) - -READ_WRITE_METHODS(THHalf, Half, - int nByteRead_; float buf; \ - int ret = sscanf(mfself->storage->data+mfself->position, "%g%n", &buf, &nByteRead_); \ - data[i] = TH_float2half(buf); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, - nByteWritten = snprintf(mfself->storage->data+mfself->position, mfself->storage->size-mfself->position, "%.9g", TH_half2float(data[i])), - 1) - -READ_WRITE_METHODS(double, Double, - int nByteRead_; int ret = sscanf(mfself->storage->data+mfself->position, "%lg%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, - nByteWritten = snprintf(mfself->storage->data+mfself->position, mfself->storage->size-mfself->position, "%.17g", data[i]), - 1) - -int THDiskFile_isLittleEndianCPU(void); - -static size_t THMemoryFile_readLong(THFile *self, long *data, size_t n) -{ - THMemoryFile *mfself = (THMemoryFile*)self; 
- size_t nread = 0L; - - THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file"); - THArgCheck(mfself->file.isReadable, 1, "attempt to read in a write-only file"); - - if (n == 0) - return 0; - - if(mfself->file.isBinary) - { - if(mfself->longSize == 0 || mfself->longSize == sizeof(long)) - { - size_t nByte = sizeof(long)*n; - size_t nByteRemaining = (mfself->position + nByte <= mfself->size ? nByte : mfself->size-mfself->position); - nread = nByteRemaining/sizeof(long); - memmove(data, mfself->storage->data+mfself->position, nread*sizeof(long)); - mfself->position += nread*sizeof(long); - } else if(mfself->longSize == 4) - { - size_t nByte = 4*n; - size_t nByteRemaining = (mfself->position + nByte <= mfself->size ? nByte : mfself->size-mfself->position); - int32_t *storage = (int32_t *)(mfself->storage->data + mfself->position); - nread = nByteRemaining/4; - size_t i; - for(i = 0; i < nread; i++) - data[i] = storage[i]; - mfself->position += nread*4; - } - else /* if(mfself->longSize == 8) */ - { - int big_endian = !THDiskFile_isLittleEndianCPU(); - size_t nByte = 8*n; - int32_t *storage = (int32_t *)(mfself->storage->data + mfself->position); - size_t nByteRemaining = (mfself->position + nByte <= mfself->size ? 
nByte : mfself->size-mfself->position); - nread = nByteRemaining/8; - size_t i; - for(i = 0; i < nread; i++) - data[i] = storage[2*i + big_endian]; - mfself->position += nread*8; - } - } - else - { - size_t i; - for(i = 0; i < n; i++) - { - size_t nByteRead = 0; - char spaceChar = 0; - char *spacePtr = THMemoryFile_strnextspace(mfself->storage->data+mfself->position, &spaceChar); - int nByteRead_; int ret = sscanf(mfself->storage->data+mfself->position, "%ld%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++; - if(ret == EOF) - { - while(mfself->storage->data[mfself->position]) - mfself->position++; - } - else - mfself->position += nByteRead; - if(spacePtr) - *spacePtr = spaceChar; - } - if(mfself->file.isAutoSpacing && (n > 0)) - { - if( (mfself->position < mfself->size) && (mfself->storage->data[mfself->position] == '\n') ) - mfself->position++; - } - } - - if(nread != n) - { - mfself->file.hasError = 1; /* shouldn't we put hasError to 0 all the time ? 
*/ - if(!mfself->file.isQuiet) - THError("read error: read %d blocks instead of %d", nread, n); - } - - return nread; -} - -static size_t THMemoryFile_writeLong(THFile *self, long *data, size_t n) -{ - THMemoryFile *mfself = (THMemoryFile*)self; - - THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file"); - THArgCheck(mfself->file.isWritable, 1, "attempt to write in a read-only file"); - - if (n == 0) - return 0; - - if(mfself->file.isBinary) - { - if(mfself->longSize == 0 || mfself->longSize == sizeof(long)) - { - size_t nByte = sizeof(long)*n; - THMemoryFile_grow(mfself, mfself->position+nByte); - memmove(mfself->storage->data+mfself->position, data, nByte); - mfself->position += nByte; - } else if(mfself->longSize == 4) - { - size_t nByte = 4*n; - THMemoryFile_grow(mfself, mfself->position+nByte); - int32_t *storage = (int32_t *)(mfself->storage->data + mfself->position); - size_t i; - for(i = 0; i < n; i++) - storage[i] = data[i]; - mfself->position += nByte; - } - else /* if(mfself->longSize == 8) */ - { - int big_endian = !THDiskFile_isLittleEndianCPU(); - size_t nByte = 8*n; - THMemoryFile_grow(mfself, mfself->position+nByte); - int32_t *storage = (int32_t *)(mfself->storage->data + mfself->position); - size_t i; - for(i = 0; i < n; i++) - { - storage[2*i + !big_endian] = 0; - storage[2*i + big_endian] = data[i]; - } - mfself->position += nByte; - } - if(mfself->position > mfself->size) - { - mfself->size = mfself->position; - mfself->storage->data[mfself->size] = '\0'; - } - } - else - { - size_t i; - for(i = 0; i < n; i++) - { - ssize_t nByteWritten; - while (1) - { - nByteWritten = snprintf(mfself->storage->data+mfself->position, mfself->storage->size-mfself->position, "%ld", data[i]); - if( (nByteWritten > -1) && (nByteWritten < mfself->storage->size-mfself->position) ) - { - mfself->position += nByteWritten; - break; - } - THMemoryFile_grow(mfself, mfself->storage->size + (mfself->storage->size/2) + 2); - } - 
if(mfself->file.isAutoSpacing) - { - if(i < n-1) - { - THMemoryFile_grow(mfself, mfself->position+1); - sprintf(mfself->storage->data+mfself->position, " "); - mfself->position++; - } - if(i == n-1) - { - THMemoryFile_grow(mfself, mfself->position+1); - sprintf(mfself->storage->data+mfself->position, "\n"); - mfself->position++; - } - } - } - if(mfself->position > mfself->size) - { - mfself->size = mfself->position; - mfself->storage->data[mfself->size] = '\0'; - } - } - - return n; -} - -static char* THMemoryFile_cloneString(const char *str, ptrdiff_t size) -{ - char *cstr = THAlloc(size); - memcpy(cstr, str, size); - return cstr; -} - -static size_t THMemoryFile_readString(THFile *self, const char *format, char **str_) -{ - THMemoryFile *mfself = (THMemoryFile*)self; - - THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file"); - THArgCheck(mfself->file.isReadable, 1, "attempt to read in a write-only file"); - THArgCheck((strlen(format) >= 2 ? (format[0] == '*') && (format[1] == 'a' || format[1] == 'l') : 0), 2, "format must be '*a' or '*l'"); - - if(mfself->position == mfself->size) /* eof ? */ - { - mfself->file.hasError = 1; - if(!mfself->file.isQuiet) - THError("read error: read 0 blocks instead of 1"); - - *str_ = NULL; - return 0; - } - - if(format[1] == 'a') - { - size_t str_size = mfself->size-mfself->position; - - *str_ = THMemoryFile_cloneString(mfself->storage->data+mfself->position, str_size); - mfself->position = mfself->size; - - return str_size; - } - else - { - char *p = mfself->storage->data+mfself->position; - int eolFound = 0; - size_t posEol; - size_t i; - for(i = 0; i < mfself->size-mfself->position; i++) - { - if(p[i] == '\n') - { - posEol = i; - eolFound = 1; - break; - } - } - - if(eolFound) - { - *str_ = THMemoryFile_cloneString(mfself->storage->data+mfself->position, posEol); - mfself->position += posEol+1; - return posEol; - } - else /* well, we read all! 
*/ - { - size_t str_size = mfself->size-mfself->position; - - *str_ = THMemoryFile_cloneString(mfself->storage->data+mfself->position, str_size); - mfself->position = mfself->size; - - return str_size; - } - } - - *str_ = NULL; - return 0; -} - -static size_t THMemoryFile_writeString(THFile *self, const char *str, size_t size) -{ - THMemoryFile *mfself = (THMemoryFile*)self; - - THArgCheck(mfself->storage != NULL, 1, "attempt to use a closed file"); - THArgCheck(mfself->file.isWritable, 1, "attempt to write in a read-only file"); - - THMemoryFile_grow(mfself, mfself->position+size); - memmove(mfself->storage->data+mfself->position, str, size); - mfself->position += size; - if(mfself->position > mfself->size) - { - mfself->size = mfself->position; - mfself->storage->data[mfself->size] = '\0'; - } - - return size; -} - -THFile *THMemoryFile_newWithStorage(THCharStorage *storage, const char *mode) -{ - static struct THFileVTable vtable = { - THMemoryFile_isOpened, - - THMemoryFile_readByte, - THMemoryFile_readChar, - THMemoryFile_readShort, - THMemoryFile_readInt, - THMemoryFile_readLong, - THMemoryFile_readFloat, - THMemoryFile_readDouble, - THMemoryFile_readHalf, - THMemoryFile_readString, - - THMemoryFile_writeByte, - THMemoryFile_writeChar, - THMemoryFile_writeShort, - THMemoryFile_writeInt, - THMemoryFile_writeLong, - THMemoryFile_writeFloat, - THMemoryFile_writeDouble, - THMemoryFile_writeHalf, - THMemoryFile_writeString, - - THMemoryFile_synchronize, - THMemoryFile_seek, - THMemoryFile_seekEnd, - THMemoryFile_position, - THMemoryFile_close, - THMemoryFile_free - }; - - THMemoryFile *mfself; - int isReadable; - int isWritable; - - if(storage) - { - THArgCheck(storage->data[storage->size-1] == '\0', 1, "provided CharStorage must be terminated by 0"); - THArgCheck(THMemoryFile_mode(mode, &isReadable, &isWritable), 2, "file mode should be 'r','w' or 'rw'"); - THCharStorage_retain(storage); - } - else - { - THArgCheck(THMemoryFile_mode(mode, &isReadable, 
&isWritable), 2, "file mode should be 'r','w' or 'rw'"); - storage = THCharStorage_newWithSize(1); - storage->data[0] = '\0'; - } - - mfself = THAlloc(sizeof(THMemoryFile)); - - mfself->storage = storage; - mfself->size = (storage ? storage->size-1 : 0); - mfself->position = 0; - mfself->longSize = 0; - - mfself->file.vtable = &vtable; - mfself->file.isQuiet = 0; - mfself->file.isReadable = isReadable; - mfself->file.isWritable = isWritable; - mfself->file.isBinary = 0; - mfself->file.isAutoSpacing = 1; - mfself->file.hasError = 0; - - return (THFile*)mfself; -} - -THFile *THMemoryFile_new(const char *mode) -{ - return THMemoryFile_newWithStorage(NULL, mode); -} diff --git a/contrib/lua-torch/torch7/lib/TH/THMemoryFile.h b/contrib/lua-torch/torch7/lib/TH/THMemoryFile.h deleted file mode 100644 index b54cdcc2f..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THMemoryFile.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef TH_MEMORY_FILE_INC -#define TH_MEMORY_FILE_INC - -#include "THFile.h" -#include "THStorage.h" - -TH_API THFile *THMemoryFile_newWithStorage(THCharStorage *storage, const char *mode); -TH_API THFile *THMemoryFile_new(const char *mode); - -TH_API THCharStorage *THMemoryFile_storage(THFile *self); -TH_API void THMemoryFile_longSize(THFile *self, int size); - -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/THRandom.c b/contrib/lua-torch/torch7/lib/TH/THRandom.c deleted file mode 100644 index 86d721e7b..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THRandom.c +++ /dev/null @@ -1,272 +0,0 @@ -#include "THGeneral.h" -#include "THRandom.h" - -/* Code for the Mersenne Twister random generator.... 
*/ -#define n _MERSENNE_STATE_N -#define m _MERSENNE_STATE_M - -/* Creates (unseeded) new generator*/ -static THGenerator* THGenerator_newUnseeded(void) -{ - THGenerator *self = THAlloc(sizeof(THGenerator)); - memset(self, 0, sizeof(THGenerator)); - self->left = 1; - self->seeded = 0; - self->normal_is_valid = 0; - return self; -} - -/* Creates new generator and makes sure it is seeded*/ -THGenerator* THGenerator_new(void) -{ - THGenerator *self = THGenerator_newUnseeded(); - THRandom_seed(self); - return self; -} - -THGenerator* THGenerator_copy(THGenerator *self, THGenerator *from) -{ - memcpy(self, from, sizeof(THGenerator)); - return self; -} - -void THGenerator_free(THGenerator *self) -{ - THFree(self); -} - -int THGenerator_isValid(THGenerator *_generator) -{ - if ((_generator->seeded == 1) && - (_generator->left > 0 && _generator->left <= n) && (_generator->next <= n)) - return 1; - - return 0; -} - -#ifndef _WIN32 -#include <sys/types.h> -#include <sys/stat.h> -#include <fcntl.h> -#include <unistd.h> - -static unsigned long readURandomLong() -{ - int randDev = open("/dev/urandom", O_RDONLY); - unsigned long randValue; - if (randDev < 0) { - THError("Unable to open /dev/urandom"); - } - ssize_t readBytes = read(randDev, &randValue, sizeof(randValue)); - if (readBytes < sizeof(randValue)) { - THError("Unable to read from /dev/urandom"); - } - close(randDev); - return randValue; -} -#endif // _WIN32 - -unsigned long THRandom_seed(THGenerator *_generator) -{ -#ifdef _WIN32 - unsigned long s = (unsigned long)time(0); -#else - unsigned long s = readURandomLong(); -#endif - THRandom_manualSeed(_generator, s); - return s; -} - -/* The next 4 methods are taken from http:www.math.keio.ac.jpmatumotoemt.html - Here is the copyright: - Some minor modifications have been made to adapt to "my" C... */ - -/* - A C-program for MT19937, with initialization improved 2002/2/10. - Coded by Takuji Nishimura and Makoto Matsumoto. 
- This is a faster version by taking Shawn Cokus's optimization, - Matthe Bellew's simplification, Isaku Wada's double version. - - Before using, initialize the state by using init_genrand(seed) - or init_by_array(init_key, key_length). - - Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura, - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - 1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - 3. The names of its contributors may not be used to endorse or promote - products derived from this software without specific prior written - permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - - Any feedback is very welcome. - http://www.math.keio.ac.jp/matumoto/emt.html - email: matumoto@math.keio.ac.jp -*/ - -/* Macros for the Mersenne Twister random generator... 
*/ -/* Period parameters */ -/* #define n 624 */ -/* #define m 397 */ -#define MATRIX_A 0x9908b0dfUL /* constant vector a */ -#define UMASK 0x80000000UL /* most significant w-r bits */ -#define LMASK 0x7fffffffUL /* least significant r bits */ -#define MIXBITS(u,v) ( ((u) & UMASK) | ((v) & LMASK) ) -#define TWIST(u,v) ((MIXBITS(u,v) >> 1) ^ ((v)&1UL ? MATRIX_A : 0UL)) -/*********************************************************** That's it. */ - -void THRandom_manualSeed(THGenerator *_generator, unsigned long the_seed_) -{ - int j; - - /* This ensures reseeding resets all of the state (i.e. state for Gaussian numbers) */ - THGenerator *blank = THGenerator_newUnseeded(); - THGenerator_copy(_generator, blank); - THGenerator_free(blank); - - _generator->the_initial_seed = the_seed_; - _generator->state[0] = _generator->the_initial_seed & 0xffffffffUL; - for(j = 1; j < n; j++) - { - _generator->state[j] = (1812433253UL * (_generator->state[j-1] ^ (_generator->state[j-1] >> 30)) + j); - /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */ - /* In the previous versions, mSBs of the seed affect */ - /* only mSBs of the array state[]. 
*/ - /* 2002/01/09 modified by makoto matsumoto */ - _generator->state[j] &= 0xffffffffUL; /* for >32 bit machines */ - } - _generator->left = 1; - _generator->seeded = 1; -} - -unsigned long THRandom_initialSeed(THGenerator *_generator) -{ - return _generator->the_initial_seed; -} - -void THRandom_nextState(THGenerator *_generator) -{ - unsigned long *p = _generator->state; - int j; - - _generator->left = n; - _generator->next = 0; - - for(j = n-m+1; --j; p++) - *p = p[m] ^ TWIST(p[0], p[1]); - - for(j = m; --j; p++) - *p = p[m-n] ^ TWIST(p[0], p[1]); - - *p = p[m-n] ^ TWIST(p[0], _generator->state[0]); -} - -unsigned long THRandom_random(THGenerator *_generator) -{ - unsigned long y; - - if (--(_generator->left) == 0) - THRandom_nextState(_generator); - y = *(_generator->state + (_generator->next)++); - - /* Tempering */ - y ^= (y >> 11); - y ^= (y << 7) & 0x9d2c5680UL; - y ^= (y << 15) & 0xefc60000UL; - y ^= (y >> 18); - - return y; -} - -/* generates a random number on [0,1)-double-interval */ -static double __uniform__(THGenerator *_generator) -{ - /* divided by 2^32 */ - return (double)THRandom_random(_generator) * (1.0/4294967296.0); -} - -/********************************************************* - - Thanks *a lot* Takuji Nishimura and Makoto Matsumoto! - - Now my own code... - -*********************************************************/ - -double THRandom_uniform(THGenerator *_generator, double a, double b) -{ - return(__uniform__(_generator) * (b - a) + a); -} - -double THRandom_normal(THGenerator *_generator, double mean, double stdv) -{ - THArgCheck(stdv > 0, 2, "standard deviation must be strictly positive"); - - /* This is known as the Box-Muller method */ - if(!_generator->normal_is_valid) - { - _generator->normal_x = __uniform__(_generator); - _generator->normal_y = __uniform__(_generator); - _generator->normal_rho = sqrt(-2. 
* log(1.0-_generator->normal_y)); - _generator->normal_is_valid = 1; - } - else - _generator->normal_is_valid = 0; - - if(_generator->normal_is_valid) - return _generator->normal_rho*cos(2.*M_PI*_generator->normal_x)*stdv+mean; - else - return _generator->normal_rho*sin(2.*M_PI*_generator->normal_x)*stdv+mean; -} - -double THRandom_exponential(THGenerator *_generator, double lambda) -{ - return(-1. / lambda * log(1-__uniform__(_generator))); -} - -double THRandom_cauchy(THGenerator *_generator, double median, double sigma) -{ - return(median + sigma * tan(M_PI*(__uniform__(_generator)-0.5))); -} - -/* Faut etre malade pour utiliser ca. - M'enfin. */ -double THRandom_logNormal(THGenerator *_generator, double mean, double stdv) -{ - THArgCheck(stdv > 0, 2, "standard deviation must be strictly positive"); - return(exp(THRandom_normal(_generator, mean, stdv))); -} - -int THRandom_geometric(THGenerator *_generator, double p) -{ - THArgCheck(p > 0 && p < 1, 1, "must be > 0 and < 1"); - return((int)(log(1-__uniform__(_generator)) / log(p)) + 1); -} - -int THRandom_bernoulli(THGenerator *_generator, double p) -{ - THArgCheck(p >= 0 && p <= 1, 1, "must be >= 0 and <= 1"); - return(__uniform__(_generator) <= p); -} diff --git a/contrib/lua-torch/torch7/lib/TH/THRandom.h b/contrib/lua-torch/torch7/lib/TH/THRandom.h deleted file mode 100644 index 28a14c0d7..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THRandom.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef TH_RANDOM_INC -#define TH_RANDOM_INC - -#include "THGeneral.h" - -#define _MERSENNE_STATE_N 624 -#define _MERSENNE_STATE_M 397 -/* A THGenerator contains all the state required for a single random number stream */ -typedef struct THGenerator { - /* The initial seed. 
*/ - unsigned long the_initial_seed; - int left; /* = 1; */ - int seeded; /* = 0; */ - unsigned long next; - unsigned long state[_MERSENNE_STATE_N]; /* the array for the state vector */ - /********************************/ - - /* For normal distribution */ - double normal_x; - double normal_y; - double normal_rho; - int normal_is_valid; /* = 0; */ -} THGenerator; - -#define torch_Generator "torch.Generator" - -/* Manipulate THGenerator objects */ -TH_API THGenerator * THGenerator_new(void); -TH_API THGenerator * THGenerator_copy(THGenerator *self, THGenerator *from); -TH_API void THGenerator_free(THGenerator *gen); - -/* Checks if given generator is valid */ -TH_API int THGenerator_isValid(THGenerator *_generator); - -/* Initializes the random number generator from /dev/urandom (or on Windows -platforms with the current time (granularity: seconds)) and returns the seed. */ -TH_API unsigned long THRandom_seed(THGenerator *_generator); - -/* Initializes the random number generator with the given long "the_seed_". */ -TH_API void THRandom_manualSeed(THGenerator *_generator, unsigned long the_seed_); - -/* Returns the starting seed used. */ -TH_API unsigned long THRandom_initialSeed(THGenerator *_generator); - -/* Generates a uniform 32 bits integer. */ -TH_API unsigned long THRandom_random(THGenerator *_generator); - -/* Generates a uniform random number on [0,1[. */ -TH_API double THRandom_uniform(THGenerator *_generator, double a, double b); - -/** Generates a random number from a normal distribution. - (With mean #mean# and standard deviation #stdv >= 0#). -*/ -TH_API double THRandom_normal(THGenerator *_generator, double mean, double stdv); - -/** Generates a random number from an exponential distribution. - The density is $p(x) = lambda * exp(-lambda * x)$, where - lambda is a positive number. -*/ -TH_API double THRandom_exponential(THGenerator *_generator, double lambda); - -/** Returns a random number from a Cauchy distribution. 
- The Cauchy density is $p(x) = sigma/(pi*(sigma^2 + (x-median)^2))$ -*/ -TH_API double THRandom_cauchy(THGenerator *_generator, double median, double sigma); - -/** Generates a random number from a log-normal distribution. - (#mean > 0# is the mean of the log-normal distribution - and #stdv# is its standard deviation). -*/ -TH_API double THRandom_logNormal(THGenerator *_generator, double mean, double stdv); - -/** Generates a random number from a geometric distribution. - It returns an integer #i#, where $p(i) = (1-p) * p^(i-1)$. - p must satisfy $0 < p < 1$. -*/ -TH_API int THRandom_geometric(THGenerator *_generator, double p); - -/* Returns true with probability $p$ and false with probability $1-p$ (p > 0). */ -TH_API int THRandom_bernoulli(THGenerator *_generator, double p); -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/THSize.c b/contrib/lua-torch/torch7/lib/TH/THSize.c deleted file mode 100644 index ccf1f61dd..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THSize.c +++ /dev/null @@ -1,26 +0,0 @@ -#include "THSize.h" - -int THSize_isSameSizeAs(const long *sizeA, long dimsA, const long *sizeB, long dimsB) { - int d; - if (dimsA != dimsB) - return 0; - for(d = 0; d < dimsA; ++d) - { - if(sizeA[d] != sizeB[d]) - return 0; - } - return 1; -} - -ptrdiff_t THSize_nElement(long dims, long *size) { - if(dims == 0) - return 0; - else - { - ptrdiff_t nElement = 1; - int d; - for(d = 0; d < dims; d++) - nElement *= size[d]; - return nElement; - } -} diff --git a/contrib/lua-torch/torch7/lib/TH/THSize.h b/contrib/lua-torch/torch7/lib/TH/THSize.h deleted file mode 100644 index 3d39696f6..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THSize.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef TH_SIZE_INC -#define TH_SIZE_INC - -#include "THGeneral.h" -#include <stddef.h> - -// THTensor functions that would work on a THSize if we had such a class in C++, -// i.e. THTensor functions that depend only on the shape of the tensor, not the type. 
- -TH_API int THSize_isSameSizeAs(const long *sizeA, long dimsA, const long *sizeB, long dimsB); -TH_API ptrdiff_t THSize_nElement(long dims, long *size); - -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/THStorage.c b/contrib/lua-torch/torch7/lib/TH/THStorage.c deleted file mode 100644 index f6b63f4a8..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THStorage.c +++ /dev/null @@ -1,153 +0,0 @@ -#include "THAtomic.h" -#include "THStorage.h" - -#include "generic/THStorage.c" -#include "THGenerateAllTypes.h" - -#include "generic/THStorage.c" -#include "THGenerateHalfType.h" - -#include "generic/THStorageCopy.c" -#include "THGenerateAllTypes.h" - -#include "generic/THStorageCopy.c" -#include "THGenerateHalfType.h" - - -THDescBuff THLongStorage_sizeDesc(const THLongStorage *size) { - return _THSizeDesc(size->data, size->size); -} - -THLongStorage *THLongStorage_newInferSize(THLongStorage *size, ptrdiff_t nElement) -{ - ptrdiff_t total_size = (size->size > 0 ? 1 : 0); - ptrdiff_t dim_infer = -1; - ptrdiff_t i; - for (i = 0; i < size->size; i++) { - if (size->data[i] == -1) { - THArgCheck(dim_infer == -1, 1, "only one dimension can be inferred"); - dim_infer = i; - } else { - total_size *= size->data[i]; - } - } - if (dim_infer != -1) { - THDescBuff buf = THLongStorage_sizeDesc(size); - THArgCheck(total_size > 0 && nElement % total_size == 0, 2, - "size '%s' is invalid for input of with %td elements", buf.str, nElement); - } else { - THDescBuff buf = THLongStorage_sizeDesc(size); - THArgCheck(nElement == total_size, 2, - "size '%s' is invalid for input of with %td elements", buf.str, nElement); - } - THLongStorage* copy = THLongStorage_newWithSize(size->size); - THLongStorage_copy(copy, size); - if (dim_infer != -1) { - copy->data[dim_infer] = nElement / total_size; - } - return copy; -} - -int THLongStorage_inferSize2(THLongStorage *output, long *sizesA, long dimsA, long *sizesB, long dimsB, - char *error_buffer, int buffer_len) { - THArgCheck(sizesA != NULL, 1, 
"sizesA must not be null"); - THArgCheck(sizesB != NULL, 2, "sizesB must not be null"); - THArgCheck(dimsA, 1, "Can't expand empty tensor a"); - THArgCheck(dimsB, 1, "Can't expand empty tensor b"); - ptrdiff_t ndim = dimsA > dimsB ? dimsA : dimsB; - - long *expandedSizes = THAlloc(sizeof(long)*ndim); - - for (long i = ndim - 1; i >= 0; --i) { - long offset = ndim - 1 - i; - long dimA = dimsA - 1 - offset; - long dimB = dimsB - 1 - offset; - long sizeA = (dimA >= 0) ? sizesA[dimA] : 1; - long sizeB = (dimB >= 0) ? sizesB[dimB] : 1; - if (sizeA == sizeB || sizeA == 1 || sizeB == 1) { - expandedSizes[i] = THMax(sizeA, sizeB); - } else { - THFree(expandedSizes); - snprintf(error_buffer, buffer_len, "The size of tensor a (%ld) must match the size of tensor b (%ld) at " - "non-singleton dimension %ld.", sizeA, sizeB, i); - return -1; - } - } - THLongStorage_resize(output, ndim); - memcpy(THLongStorage_data(output), expandedSizes, sizeof(long)*ndim); - THFree(expandedSizes); - return 0; -} - -int THLongStorage_inferSizeN(THLongStorage *output, int n, long **sizes, long *dims, - char *error_buffer, int buffer_len) { - THArgCheck(n > 0, 2, "n must be greater than 0"); - THArgCheck(sizes != NULL, 1, "sizes must not be null"); - THArgCheck(dims != NULL, 1, "dims must not be null"); - - ptrdiff_t ndim = 0; - for (int j = 0; j < n; ++j) { - THArgCheck(sizes[ j ] != NULL, 1, "size %d must not be null", j); - THArgCheck(dims[ j ], 1, "Can't expand empty tensor %d", j); - ndim = dims[ j ] > ndim ? dims[ j ] : ndim; - } - - long *expandedSizes = THAlloc(sizeof(long)*ndim); - - for (long i = ndim - 1; i >= 0; --i) { - expandedSizes[ i ] = 1; - long offset = ndim - 1 - i; - for (int j = 0; j < n; ++j) { - long dim = dims[ j ] - 1 - offset; - long size = (dim >= 0) ? 
sizes[ j ][ dim ] : 1; - if (size == expandedSizes[ i ] || size == 1 || expandedSizes[ i ] == 1) { - expandedSizes[ i ] = THMax(expandedSizes[ i ], size); - } else { - THFree(expandedSizes); - snprintf(error_buffer, buffer_len, "The size of tensor %i (%ld) must match the expanded size" - "of tensor (%ld) at non-singleton dimension %ld.", j, size, expandedSizes[ i ], i); - return -1; - } - } - } - THLongStorage_resize(output, ndim); - memcpy(THLongStorage_data(output), expandedSizes, sizeof(long)*ndim); - THFree(expandedSizes); - return 0; -} - -int THLongStorage_inferExpandGeometry(long *tensorSizes, long *tensorStrides, long tensorDim, - THLongStorage *sizes, long **expandedSizes, long **expandedStrides, - char *error_buffer, int buffer_len) { - ptrdiff_t ndim = THLongStorage_size(sizes); - - long *expandedSizesCalc = THAlloc(sizeof(long)*ndim); - long *expandedStridesCalc = THAlloc(sizeof(long)*ndim); - - // create a new geometry for the tensors - for (long i = ndim - 1; i >= 0; --i) { - long offset = ndim - 1 - i; - long dim = tensorDim - 1 - offset; - long size = (dim >= 0) ? tensorSizes[dim] : 1; - long stride = (dim >= 0) ? 
- tensorStrides[dim] : expandedSizesCalc[i + 1] * expandedStridesCalc[i+1]; - long targetSize = THLongStorage_data(sizes)[i]; - if (size != targetSize) { - if (size == 1) { - size = targetSize; - stride = 0; - } else { - THFree(expandedSizesCalc); - THFree(expandedStridesCalc); - snprintf(error_buffer, buffer_len, "The expanded size of the tensor (%ld) must match the existing size (%ld) at " - "non-singleton dimension %ld.", targetSize, size, i); - return -1; - } - } - expandedSizesCalc[i] = size; - expandedStridesCalc[i] = stride; - } - *expandedSizes = expandedSizesCalc; - *expandedStrides = expandedStridesCalc; - return 0; -} diff --git a/contrib/lua-torch/torch7/lib/TH/THStorage.h b/contrib/lua-torch/torch7/lib/TH/THStorage.h deleted file mode 100644 index fb7946bd9..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THStorage.h +++ /dev/null @@ -1,39 +0,0 @@ -#ifndef TH_STORAGE_INC -#define TH_STORAGE_INC - -#include "THGeneral.h" -#include "THAllocator.h" - -#define THStorage TH_CONCAT_3(TH,Real,Storage) -#define THStorage_(NAME) TH_CONCAT_4(TH,Real,Storage_,NAME) - -/* fast access methods */ -#define TH_STORAGE_GET(storage, idx) ((storage)->data[(idx)]) -#define TH_STORAGE_SET(storage, idx, value) ((storage)->data[(idx)] = (value)) - -#include "generic/THStorage.h" -#include "THGenerateAllTypes.h" - -#include "generic/THStorage.h" -#include "THGenerateHalfType.h" - -#include "generic/THStorageCopy.h" -#include "THGenerateAllTypes.h" - -#include "generic/THStorageCopy.h" -#include "THGenerateHalfType.h" - -TH_API THDescBuff THLongStorage_sizeDesc(const THLongStorage *size); -TH_API THLongStorage *THLongStorage_newInferSize(THLongStorage *size, ptrdiff_t nElement); - -// Given the sizes of {2,N} tensors, write out the size when the tensors are expanded together. 
-TH_API int THLongStorage_inferSize2(THLongStorage *output, long *sizesA, long dimsA, - long *sizesB, long dimsB, char *error_buffer, int buffer_len); -TH_API int THLongStorage_inferSizeN(THLongStorage *output, int n, long **sizes, long *dims, - char *error_buffer, int buffer_len); - -TH_API int THLongStorage_inferExpandGeometry(long *tensorSizes, long *tensorStrides, long tensorDim, - THLongStorage *sizes, long **expandedSizes, long **expandedStrides, - char *error_buffer, int buffer_len); - -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/THTensor.c b/contrib/lua-torch/torch7/lib/TH/THTensor.c deleted file mode 100644 index 115e396a1..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THTensor.c +++ /dev/null @@ -1,34 +0,0 @@ -#include "THAtomic.h" -#include "THTensor.h" -#include "THVector.h" -#include "generic/simd/simd.h" - -#include "THBlas.h" -#include "THLapack.h" -#include "THRandom.h" -#include "THTensorDimApply.h" -#include "THMath.h" - -#include "generic/THTensor.c" -#include "THGenerateAllTypes.h" - -#include "generic/THTensor.c" -#include "THGenerateHalfType.h" - -#include "generic/THTensorCopy.c" -#include "THGenerateAllTypes.h" - -#include "generic/THTensorCopy.c" -#include "THGenerateHalfType.h" - -#include "generic/THTensorRandom.c" -#include "THGenerateAllTypes.h" - -#include "generic/THTensorMath.c" -#include "THGenerateAllTypes.h" - -#include "generic/THTensorConv.c" -#include "THGenerateAllTypes.h" - -#include "generic/THTensorLapack.c" -#include "THGenerateFloatTypes.h" diff --git a/contrib/lua-torch/torch7/lib/TH/THTensor.h b/contrib/lua-torch/torch7/lib/TH/THTensor.h deleted file mode 100644 index d2a1c57e8..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THTensor.h +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef TH_TENSOR_INC -#define TH_TENSOR_INC - -#include "THStorage.h" -#include "THTensorApply.h" - -#define THTensor TH_CONCAT_3(TH,Real,Tensor) -#define THTensor_(NAME) TH_CONCAT_4(TH,Real,Tensor_,NAME) - -/* basics */ -#include 
"generic/THTensor.h" -#include "THGenerateAllTypes.h" - -#include "generic/THTensor.h" -#include "THGenerateHalfType.h" - -#include "generic/THTensorCopy.h" -#include "THGenerateAllTypes.h" - -#include "generic/THTensorCopy.h" -#include "THGenerateHalfType.h" - -#include "THTensorMacros.h" - -/* random numbers */ -#include "THRandom.h" -#include "generic/THTensorRandom.h" -#include "THGenerateAllTypes.h" - -/* maths */ -#include "generic/THTensorMath.h" -#include "THGenerateAllTypes.h" - -/* convolutions */ -#include "generic/THTensorConv.h" -#include "THGenerateAllTypes.h" - -/* lapack support */ -#include "generic/THTensorLapack.h" -#include "THGenerateFloatTypes.h" - -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/THTensorApply.h b/contrib/lua-torch/torch7/lib/TH/THTensorApply.h deleted file mode 100644 index 7f48da47e..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THTensorApply.h +++ /dev/null @@ -1,238 +0,0 @@ -#ifndef TH_TENSOR_APPLY_INC -#define TH_TENSOR_APPLY_INC - -/* - * The basic strategy for apply is as follows: - * - * 1. Starting with the outermost index, loop until we reach a dimension where the - * data is no longer contiguous, i.e. the stride at that dimension is not equal to - * the size of the tensor defined by the outer dimensions. Let's call this outer - * (contiguous) tensor A. Note that if the Tensor is contiguous, then A is equal - * to the entire Tensor. Let's call the inner tensor B. - * - * 2. We loop through the indices in B, starting at its outermost dimension. For - * example, if B is a 2x2 matrix, then we do: - * - * B[0][0] - * B[0][1] - * B[1][0] - * B[1][1] - * - * We set the offset into the underlying storage as (storageOffset + stride_B * index_B), - * i.e. basically we compute the offset into the storage as we would normally for a - * Tensor. 
But because we are guaranteed the subsequent data is contiguous in memory, we - * can simply loop for sizeof(A) iterations and perform the operation, without having to - * follow the order described by the strides of A. - * - * 3. As an optimization, we merge dimensions of A that are contiguous in memory. For - * example, if A is a 3x3x3x3 tensor narrowed from a 3x3x4x3 tensor, then the first two - * dimensions can be merged for the purposes of APPLY, reducing the number of nested - * loops. - */ - -#define __TH_TENSOR_APPLYX_PREAMBLE(TYPE, TENSOR, DIM, ALLOW_CONTIGUOUS) \ - TYPE *TENSOR##_data = NULL; \ - long *TENSOR##_counter = NULL, *TENSOR##_sizes = NULL, *TENSOR##_strides = NULL, *TENSOR##_dimOffset = NULL; \ - long TENSOR##_stride = 0, TENSOR##_size = 0, TENSOR##_dim = 0, TENSOR##_i, TENSOR##_n; \ - int TENSOR##_contiguous = ALLOW_CONTIGUOUS && DIM < 0; \ - TENSOR##_n = (TENSOR->nDimension ? 1 : 0); \ - for(TENSOR##_i = 0; TENSOR##_i < TENSOR->nDimension; TENSOR##_i++) \ - TENSOR##_n *= TENSOR->size[TENSOR##_i]; \ -\ - if(TENSOR->nDimension == 0) \ - TH_TENSOR_APPLY_hasFinished = 1; \ - else \ - { \ - TENSOR##_data = TENSOR->storage->data+TENSOR->storageOffset; \ - TENSOR##_size = 1; \ - TENSOR##_stride = 1; \ - for(TENSOR##_i = TENSOR->nDimension-1; TENSOR##_i >= 0; TENSOR##_i--) { \ - if(TENSOR->size[TENSOR##_i] != 1) { \ - if(TENSOR->stride[TENSOR##_i] == TENSOR##_size && TENSOR##_i != DIM) \ - TENSOR##_size *= TENSOR->size[TENSOR##_i]; \ - else{ \ - TENSOR##_contiguous = 0; \ - break; \ - } \ - } \ - } \ - if (!TENSOR##_contiguous) { \ - /* Find the dimension of contiguous sections */ \ - TENSOR##_dim = 1; \ - for(TENSOR##_i = TENSOR->nDimension-2; TENSOR##_i >= 0; TENSOR##_i--) \ - { \ - if(TENSOR->stride[TENSOR##_i] != TENSOR->stride[TENSOR##_i+1] * TENSOR->size[TENSOR##_i+1] || TENSOR##_i == DIM || TENSOR##_i+1 == DIM) \ - TENSOR##_dim++; \ - } \ - /* Allocate an array of 3*dim elements, where dim is the number of contiguous sections */ \ - 
TENSOR##_counter = (long*)THAlloc(sizeof(long)*(3*TENSOR##_dim)); \ - TENSOR##_sizes = TENSOR##_counter + TENSOR##_dim; \ - TENSOR##_strides = TENSOR##_counter + 2*TENSOR##_dim; \ - TH_TENSOR_dim_index = TENSOR##_dim-1; \ - TENSOR##_dimOffset = (DIM == TENSOR->nDimension-1) ? &TENSOR##_i : &TENSOR##_counter[DIM]; \ - TENSOR##_sizes[TH_TENSOR_dim_index] = TENSOR->size[TENSOR->nDimension-1]; \ - TENSOR##_strides[TH_TENSOR_dim_index] = TENSOR->stride[TENSOR->nDimension-1]; \ - /* TENSOR##_counter tracks where we are in the storage. The offset into the */ \ - /* storage is given by storage_offset + (i * j), where i is the stride */ \ - /* vector and j is tensor_counter vector. This sets the starting position for the loop. */ \ - for(TENSOR##_i = TENSOR##_dim-1; TENSOR##_i >= 0; --TENSOR##_i) { \ - TENSOR##_counter[TENSOR##_i] = 0; \ - } \ - for(TENSOR##_i = TENSOR->nDimension-2; TENSOR##_i >= 0; --TENSOR##_i) { \ - if (TENSOR->stride[TENSOR##_i] == TENSOR->stride[TENSOR##_i+1] * TENSOR->size[TENSOR##_i+1] && TENSOR##_i != DIM && TENSOR##_i+1 != DIM) { \ - TENSOR##_sizes[TH_TENSOR_dim_index] = TENSOR->size[TENSOR##_i] * TENSOR##_sizes[TH_TENSOR_dim_index]; \ - if (DIM != TENSOR->nDimension-1 && TENSOR##_i < DIM) \ - TENSOR##_dimOffset--; \ - } else { \ - --TH_TENSOR_dim_index; \ - TENSOR##_sizes[TH_TENSOR_dim_index] = TENSOR->size[TENSOR##_i]; \ - TENSOR##_strides[TH_TENSOR_dim_index] = TENSOR->stride[TENSOR##_i]; \ - } \ - } \ - /* Size of the inner most section */ \ - TENSOR##_size = TENSOR##_sizes[TENSOR##_dim-1]; \ - /* Stride of the inner most section */ \ - TENSOR##_stride = TENSOR##_strides[TENSOR##_dim-1]; \ - } \ - } \ - TENSOR##_i = 0; - -#define __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR, ALWAYS_UPDATE) \ - if(TENSOR##_i == TENSOR##_size || ALWAYS_UPDATE) \ - { \ - if(TENSOR##_contiguous) \ - break; \ -\ - if(TENSOR##_dim == 1) \ - break; \ -\ - /* Reset pointer to beginning of loop */ \ - TENSOR##_data -= TENSOR##_size*TENSOR##_stride; \ - for(TENSOR##_i = 
TENSOR##_dim-2; TENSOR##_i >= 0; TENSOR##_i--) \ - { \ - TENSOR##_counter[TENSOR##_i]++; \ - /* Jump ahread by the stride of this dimension */ \ - TENSOR##_data += TENSOR##_strides[TENSOR##_i]; \ -\ - if(TENSOR##_counter[TENSOR##_i] == TENSOR##_sizes[TENSOR##_i]) \ - { \ - if(TENSOR##_i == 0) \ - { \ - TH_TENSOR_APPLY_hasFinished = 1; \ - break; \ - } \ - else \ - { \ - /* Reset the pointer to the beginning of the chunk defined by this dimension */ \ - TENSOR##_data -= TENSOR##_counter[TENSOR##_i]*TENSOR##_strides[TENSOR##_i]; \ - TENSOR##_counter[TENSOR##_i] = 0; \ - } \ - } \ - else \ - break; \ - } \ - TENSOR##_i = 0; \ - } \ - -#define TH_TENSOR_APPLY3_D(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, DIM, CODE) \ -{ \ - int TH_TENSOR_APPLY_hasFinished = 0; \ - long TH_TENSOR_dim_index = 0; \ - __TH_TENSOR_APPLYX_PREAMBLE(TYPE1, TENSOR1, DIM, 1) \ - __TH_TENSOR_APPLYX_PREAMBLE(TYPE2, TENSOR2, DIM, 1) \ - __TH_TENSOR_APPLYX_PREAMBLE(TYPE3, TENSOR3, DIM, 1) \ - \ - int elements_equal = 1; \ - if(TENSOR1##_n != TENSOR2##_n) { \ - elements_equal = 0; \ - } \ - else if(TENSOR1##_n != TENSOR3##_n) { \ - elements_equal = 0; \ - } \ - if (elements_equal == 0) { \ - THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->nDimension); \ - THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->nDimension); \ - THDescBuff T3buff = _THSizeDesc(TENSOR3->size, TENSOR3->nDimension); \ - THError("inconsistent tensor size, expected %s %s, %s %s and %s %s to have the same " \ - "number of elements, but got %d, %d and %d elements respectively", \ - #TENSOR1, T1buff.str, #TENSOR2, T2buff.str, #TENSOR3, T3buff.str, \ - TENSOR1##_n, TENSOR2##_n, TENSOR3##_n); \ - } \ - \ - while(!TH_TENSOR_APPLY_hasFinished) \ - { \ - /* Loop through the inner most region of the Tensor */ \ - for(; TENSOR1##_i < TENSOR1##_size && TENSOR2##_i < TENSOR2##_size && TENSOR3##_i < TENSOR3##_size; TENSOR1##_i++, TENSOR2##_i++, TENSOR3##_i++, TENSOR1##_data += TENSOR1##_stride, TENSOR2##_data += 
TENSOR2##_stride, TENSOR3##_data += TENSOR3##_stride) /* 0 et pas TENSOR##_dim! */ \ - { \ - CODE \ - } \ - __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR1, 0) \ - __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR2, 0) \ - __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR3, 0) \ - } \ - if(TENSOR1##_counter != NULL) \ - THFree(TENSOR1##_counter); \ - if(TENSOR2##_counter != NULL) \ - THFree(TENSOR2##_counter); \ - if(TENSOR3##_counter != NULL) \ - THFree(TENSOR3##_counter); \ -} - -#define TH_TENSOR_APPLY3(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, CODE) \ - TH_TENSOR_APPLY3_D(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, -1, CODE) - -#define TH_TENSOR_APPLY2_D(TYPE1, TENSOR1, TYPE2, TENSOR2, DIM, CODE) \ -{ \ - int TH_TENSOR_APPLY_hasFinished = 0; \ - long TH_TENSOR_dim_index = 0; \ - __TH_TENSOR_APPLYX_PREAMBLE(TYPE1, TENSOR1, DIM, 1) \ - __TH_TENSOR_APPLYX_PREAMBLE(TYPE2, TENSOR2, DIM, 1) \ -\ - if(TENSOR1##_n != TENSOR2##_n) { \ - THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->nDimension); \ - THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->nDimension); \ - THError("inconsistent tensor size, expected %s %s and %s %s to have the same " \ - "number of elements, but got %d and %d elements respectively", \ - #TENSOR1, T1buff.str, #TENSOR2, T2buff.str, TENSOR1##_n, TENSOR2##_n); \ - } \ - while(!TH_TENSOR_APPLY_hasFinished) \ - { \ - /* Loop through the inner most region of the Tensor */ \ - for(; TENSOR1##_i < TENSOR1##_size && TENSOR2##_i < TENSOR2##_size; TENSOR1##_i++, TENSOR2##_i++, TENSOR1##_data += TENSOR1##_stride, TENSOR2##_data += TENSOR2##_stride) /* 0 et pas TENSOR##_dim! 
*/ \ - { \ - CODE \ - } \ - __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR1, 0) \ - __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR2, 0) \ - } \ - if(TENSOR1##_counter != NULL) \ - THFree(TENSOR1##_counter); \ - if(TENSOR2##_counter != NULL) \ - THFree(TENSOR2##_counter); \ -} - -#define TH_TENSOR_APPLY2(TYPE1, TENSOR1, TYPE2, TENSOR2, CODE) \ - TH_TENSOR_APPLY2_D(TYPE1, TENSOR1, TYPE2, TENSOR2, -1, CODE) - -#define TH_TENSOR_APPLY_D(TYPE, TENSOR, DIM, CODE) \ -{ \ - int TH_TENSOR_APPLY_hasFinished = 0; \ - long TH_TENSOR_dim_index = 0; \ - __TH_TENSOR_APPLYX_PREAMBLE(TYPE, TENSOR, DIM, 0) \ -\ - while(!TH_TENSOR_APPLY_hasFinished) \ - { \ - /* Loop through the inner most region of the Tensor */ \ - for(; TENSOR##_i < TENSOR##_size; TENSOR##_i++, TENSOR##_data += TENSOR##_stride) /* 0 et pas TENSOR##_dim! */ \ - { \ - CODE \ - } \ - __TH_TENSOR_APPLYX_UPDATE_COUNTERS(TENSOR, 1) \ - } \ - THFree(TENSOR##_counter); \ -} - -#define TH_TENSOR_APPLY(TYPE, TENSOR, CODE) \ - TH_TENSOR_APPLY_D(TYPE, TENSOR, -1, CODE) - -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/THTensorDimApply.h b/contrib/lua-torch/torch7/lib/TH/THTensorDimApply.h deleted file mode 100644 index 6727e1f7f..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THTensorDimApply.h +++ /dev/null @@ -1,324 +0,0 @@ -#ifndef TH_TENSOR_DIM_APPLY_INC -#define TH_TENSOR_DIM_APPLY_INC - -#define TH_TENSOR_DIM_APPLY3(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, DIMENSION, CODE) \ -{ \ - TYPE1 *TENSOR1##_data = NULL; \ - long TENSOR1##_stride = 0, TENSOR1##_size = 0; \ - TYPE2 *TENSOR2##_data = NULL; \ - long TENSOR2##_stride = 0, TENSOR2##_size = 0; \ - TYPE3 *TENSOR3##_data = NULL; \ - long TENSOR3##_stride = 0, TENSOR3##_size = 0; \ - long *TH_TENSOR_DIM_APPLY_counter = NULL; \ - int TH_TENSOR_DIM_APPLY_hasFinished = 0; \ - int TH_TENSOR_DIM_APPLY_i; \ -\ - if( (DIMENSION < 0) || (DIMENSION >= TENSOR1->nDimension) ) \ - THError("invalid dimension %d (expected to be 0 <= dim < %d)", DIMENSION, TENSOR1->nDimension); \ - 
int same_dims = 1; \ - if( TENSOR1->nDimension != TENSOR2->nDimension ) { \ - same_dims = 0; \ - } \ - if( TENSOR1->nDimension != TENSOR3->nDimension ) { \ - same_dims = 0; \ - } \ - if (same_dims == 0) { \ - THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->nDimension); \ - THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->nDimension); \ - THDescBuff T3buff = _THSizeDesc(TENSOR3->size, TENSOR3->nDimension); \ - THError("inconsistent tensor size, expected %s %s, %s %s and %s %s to have the same " \ - "number of dimensions", #TENSOR1, T1buff.str, #TENSOR2, T2buff.str, #TENSOR3, T3buff.str); \ - } \ - int shape_check_flag = 0; \ - for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->nDimension; TH_TENSOR_DIM_APPLY_i++) \ - { \ - if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \ - continue; \ - if(TENSOR1->size[TH_TENSOR_DIM_APPLY_i] != TENSOR2->size[TH_TENSOR_DIM_APPLY_i]) \ - shape_check_flag = 1; \ - if(TENSOR1->size[TH_TENSOR_DIM_APPLY_i] != TENSOR3->size[TH_TENSOR_DIM_APPLY_i]) \ - shape_check_flag = 1; \ - } \ - \ - if (shape_check_flag == 1) { \ - THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->nDimension); \ - THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->nDimension); \ - THDescBuff T3buff = _THSizeDesc(TENSOR3->size, TENSOR3->nDimension); \ - THError("Expected %s %s, %s %s and %s %s to have the same size in dimension %d", \ - #TENSOR1, T1buff.str, #TENSOR2, T2buff.str, #TENSOR3, T3buff.str, DIMENSION); \ - } \ -\ - TH_TENSOR_DIM_APPLY_counter = (long*)THAlloc(sizeof(long)*(TENSOR1->nDimension)); \ - for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->nDimension; TH_TENSOR_DIM_APPLY_i++) \ - TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ -\ - TENSOR1##_data = (TENSOR1)->storage->data+(TENSOR1)->storageOffset; \ - TENSOR1##_stride = (TENSOR1)->stride[DIMENSION]; \ - TENSOR1##_size = TENSOR1->size[DIMENSION]; \ -\ - TENSOR2##_data = (TENSOR2)->storage->data+(TENSOR2)->storageOffset; \ - TENSOR2##_stride = 
(TENSOR2)->stride[DIMENSION]; \ - TENSOR2##_size = TENSOR2->size[DIMENSION]; \ -\ - TENSOR3##_data = (TENSOR3)->storage->data+(TENSOR3)->storageOffset; \ - TENSOR3##_stride = (TENSOR3)->stride[DIMENSION]; \ - TENSOR3##_size = TENSOR3->size[DIMENSION]; \ -\ - while(!TH_TENSOR_DIM_APPLY_hasFinished) \ - { \ - CODE \ -\ - if(TENSOR1->nDimension == 1) \ - break; \ - \ - for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->nDimension; TH_TENSOR_DIM_APPLY_i++) \ - { \ - if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \ - { \ - if(TH_TENSOR_DIM_APPLY_i == TENSOR1->nDimension-1) \ - { \ - TH_TENSOR_DIM_APPLY_hasFinished = 1; \ - break; \ - } \ - continue; \ - } \ -\ - TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]++; \ - TENSOR1##_data += TENSOR1->stride[TH_TENSOR_DIM_APPLY_i]; \ - TENSOR2##_data += TENSOR2->stride[TH_TENSOR_DIM_APPLY_i]; \ - TENSOR3##_data += TENSOR3->stride[TH_TENSOR_DIM_APPLY_i]; \ -\ - if(TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] == TENSOR1->size[TH_TENSOR_DIM_APPLY_i]) \ - { \ - if(TH_TENSOR_DIM_APPLY_i == TENSOR1->nDimension-1) \ - { \ - TH_TENSOR_DIM_APPLY_hasFinished = 1; \ - break; \ - } \ - else \ - { \ - TENSOR1##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR1->stride[TH_TENSOR_DIM_APPLY_i]; \ - TENSOR2##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR2->stride[TH_TENSOR_DIM_APPLY_i]; \ - TENSOR3##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR3->stride[TH_TENSOR_DIM_APPLY_i]; \ - TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ - } \ - } \ - else \ - break; \ - } \ - } \ - THFree(TH_TENSOR_DIM_APPLY_counter); \ -} - -/** - * Similar to DIM_APPLY(...) but we maintain two sets of pointers: one for the first tensor - * and one for the second. The two tensors must have the same shape, other than at the - * specified DIMENSION. This function makes it easy to store the output from reducing the - * TENSOR at index. 
For example, in the sum example described below, we could instead do: - * - * long i = 0; - * TYPE1 sum; - * - * for (i = 0; i < TENSOR1##_size; ++i) { - * sum += TENSOR1##_data[i * TENSOR1##_stride] - * } - * *TENSOR2##_data = (TYPE2) sum; - * - * In particular, we guarantee that the offset into TENSOR2 will be what you would get if - * you applied all of the index values used to generate the offset into TENSOR1. - */ -#define TH_TENSOR_DIM_APPLY2(TYPE1, TENSOR1, TYPE2, TENSOR2, DIMENSION, CODE) \ -{ \ - TYPE1 *TENSOR1##_data = NULL; \ - long TENSOR1##_stride = 0, TENSOR1##_size = 0; \ - TYPE2 *TENSOR2##_data = NULL; \ - long TENSOR2##_stride = 0, TENSOR2##_size = 0; \ - long *TH_TENSOR_DIM_APPLY_counter = NULL; \ - int TH_TENSOR_DIM_APPLY_hasFinished = 0; \ - int TH_TENSOR_DIM_APPLY_i; \ -\ - if( (DIMENSION < 0) || (DIMENSION >= TENSOR1->nDimension) ) \ - THError("invalid dimension %d (expected to be 0 <= dim < %d)", DIMENSION, TENSOR1->nDimension); \ - if( TENSOR1->nDimension != TENSOR2->nDimension ) { \ - THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->nDimension); \ - THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->nDimension); \ - THError("inconsistent tensor size, expected %s %s and %s %s to have the same " \ - "number of dimensions", #TENSOR1, T1buff.str, #TENSOR2, T2buff.str); \ - } \ - int shape_check_flag = 0; \ - for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->nDimension; TH_TENSOR_DIM_APPLY_i++) \ - { \ - if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \ - continue; \ - if(TENSOR1->size[TH_TENSOR_DIM_APPLY_i] != TENSOR2->size[TH_TENSOR_DIM_APPLY_i]) { \ - THDescBuff T1buff = _THSizeDesc(TENSOR1->size, TENSOR1->nDimension); \ - THDescBuff T2buff = _THSizeDesc(TENSOR2->size, TENSOR2->nDimension); \ - THError("Expected %s %s and %s %s to have the same size in dimension %d", \ - #TENSOR1, T1buff.str, #TENSOR2, T2buff.str, DIMENSION); \ - } \ - } \ -\ - TH_TENSOR_DIM_APPLY_counter = (long*)THAlloc(sizeof(long)*(TENSOR1->nDimension)); 
\ - for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->nDimension; TH_TENSOR_DIM_APPLY_i++) \ - TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ -\ - TENSOR1##_data = (TENSOR1)->storage->data+(TENSOR1)->storageOffset; \ - TENSOR1##_stride = (TENSOR1)->stride[DIMENSION]; \ - TENSOR1##_size = TENSOR1->size[DIMENSION]; \ -\ - TENSOR2##_data = (TENSOR2)->storage->data+(TENSOR2)->storageOffset; \ - TENSOR2##_stride = (TENSOR2)->stride[DIMENSION]; \ - TENSOR2##_size = TENSOR2->size[DIMENSION]; \ -\ - while(!TH_TENSOR_DIM_APPLY_hasFinished) \ - { \ - CODE \ -\ - if(TENSOR1->nDimension == 1) \ - break; \ - \ - for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR1->nDimension; TH_TENSOR_DIM_APPLY_i++) \ - { \ - if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \ - { \ - if(TH_TENSOR_DIM_APPLY_i == TENSOR1->nDimension-1) \ - { \ - TH_TENSOR_DIM_APPLY_hasFinished = 1; \ - break; \ - } \ - continue; \ - } \ -\ - TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]++; \ - TENSOR1##_data += TENSOR1->stride[TH_TENSOR_DIM_APPLY_i]; \ - TENSOR2##_data += TENSOR2->stride[TH_TENSOR_DIM_APPLY_i]; \ -\ - if(TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] == TENSOR1->size[TH_TENSOR_DIM_APPLY_i]) \ - { \ - if(TH_TENSOR_DIM_APPLY_i == TENSOR1->nDimension-1) \ - { \ - TH_TENSOR_DIM_APPLY_hasFinished = 1; \ - break; \ - } \ - else \ - { \ - TENSOR1##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR1->stride[TH_TENSOR_DIM_APPLY_i]; \ - TENSOR2##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR2->stride[TH_TENSOR_DIM_APPLY_i]; \ - TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ - } \ - } \ - else \ - break; \ - } \ - } \ - THFree(TH_TENSOR_DIM_APPLY_counter); \ -} - -/** - * The basic idea for DIM_APPLY: Given a TENSOR and a DIMENSION, provide access to the data stored - * at all sets of dimension values other than DIMENSION, such that we can get all the values at those - * fixed indices for the various values at DIMENSION. 
- * - * Suppose we have a 2x3x4 Tensor A, and we have DIMENSION=2. Then we will hit CODE (2x3) times, and the - * pointer into storage will be at: - * - * A[0][0] - * A[0][1] - * A[0][2] - * A[1][0] - * A[1][1] - * A[1][2] - * - * And at each point, we can access the data for each of the four elements of the Tensor via - * TENSOR##_stride. So for example, if we wanted to sum the elements there, we could do: - * - * long i = 0; - * TYPE sum; - * for (i = 0; i < TENSOR##_size; i++) { - * sum += TENSOR##_data[i * TENSOR##_stride] - * } - * - * Note that we don't have to have DIMENSION be the last tensor. If we have DIMENSION=1, then we will hit the - * code (2x4) times, with pointer into the storage at: - * - * offset + - * stride_0 * 0 + stride_2 * 0 - * stride_0 * 1 + stride_2 * 0 - * stride_0 * 0 + stride_2 * 1 - * stride_0 * 1 + stride_2 * 1 - * stride_0 * 0 + stride_2 * 2 - * stride_0 * 1 + stride_2 * 2 - * stride_0 * 0 + stride_2 * 3 - * stride_0 * 1 + stride_2 * 3 - * - * So we can again sum over the values at DIMENSION with the other indices fixed. 
- */ -#define TH_TENSOR_DIM_APPLY(TYPE, TENSOR, DIMENSION, CODE) \ -{ \ - TYPE *TENSOR##_data = NULL; \ - long TENSOR##_stride = 0, TENSOR##_size = 0; \ - long *TH_TENSOR_DIM_APPLY_counter = NULL; \ - int TH_TENSOR_DIM_APPLY_hasFinished = 0; \ - int TH_TENSOR_DIM_APPLY_i; \ -\ - if( (DIMENSION < 0) || (DIMENSION >= TENSOR->nDimension) ) \ - THError("invalid dimension"); \ -\ - TENSOR##_data = (TENSOR)->storage->data+(TENSOR)->storageOffset; \ - TENSOR##_stride = (TENSOR)->stride[DIMENSION]; \ - TENSOR##_size = TENSOR->size[DIMENSION]; \ - /* Counter stores the indices into the Tensor at any time */ \ - TH_TENSOR_DIM_APPLY_counter = (long*)THAlloc(sizeof(long)*(TENSOR->nDimension)); \ - for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR->nDimension; TH_TENSOR_DIM_APPLY_i++) \ - TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ -\ - while(!TH_TENSOR_DIM_APPLY_hasFinished) \ - { \ - CODE \ -\ - if(TENSOR->nDimension == 1) \ - break; \ - \ - for(TH_TENSOR_DIM_APPLY_i = 0; TH_TENSOR_DIM_APPLY_i < TENSOR->nDimension; TH_TENSOR_DIM_APPLY_i++) \ - { \ - /* Check if the index is equal to DIMENSION. We don't need to update the */ \ - /* offset if this is the case, and can consider the next index. However, */ \ - /* in the case that the DIMENSION is the last index in the Tensor, then */ \ - /* we have parsed the entire tensor and can exit */ \ - if(TH_TENSOR_DIM_APPLY_i == DIMENSION) \ - { \ - if(TH_TENSOR_DIM_APPLY_i == TENSOR->nDimension-1) \ - { \ - TH_TENSOR_DIM_APPLY_hasFinished = 1; \ - break; \ - } \ - continue; \ - } \ -\ - /* Bump the counter at this index, update the pointer */ \ - TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]++; \ - TENSOR##_data += TENSOR->stride[TH_TENSOR_DIM_APPLY_i]; \ -\ - if(TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] == TENSOR->size[TH_TENSOR_DIM_APPLY_i]) \ - { \ - /* Handled TENSOR_size(dim) iterations for DIM_APPLY_i. 
If this is the last dimension, exit */ \ - if(TH_TENSOR_DIM_APPLY_i == TENSOR->nDimension-1) \ - { \ - TH_TENSOR_DIM_APPLY_hasFinished = 1; \ - break; \ - } \ - else \ - { \ - /* Reset the counter, and the pointer to the beginning of the storage for this combination of indices */ \ - TENSOR##_data -= TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i]*TENSOR->stride[TH_TENSOR_DIM_APPLY_i]; \ - TH_TENSOR_DIM_APPLY_counter[TH_TENSOR_DIM_APPLY_i] = 0; \ - } \ - } \ - else \ - break; \ - } \ - } \ - THFree(TH_TENSOR_DIM_APPLY_counter); \ -} - -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/THTensorMacros.h b/contrib/lua-torch/torch7/lib/TH/THTensorMacros.h deleted file mode 100644 index 15b67665e..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THTensorMacros.h +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef TH_TENSOR_MACROS_INC -#define TH_TENSOR_MACROS_INC - -/* fast method to access to tensor data */ - -#define THTensor_fastGet1d(self, x0) \ - (((self)->storage->data+(self)->storageOffset)[(x0)*(self)->stride[0]]) - -#define THTensor_fastGet2d(self, x0, x1) \ - (((self)->storage->data+(self)->storageOffset)[(x0)*(self)->stride[0]+(x1)*(self)->stride[1]]) - -#define THTensor_fastGet3d(self, x0, x1, x2) \ - (((self)->storage->data+(self)->storageOffset)[(x0)*(self)->stride[0]+(x1)*(self)->stride[1]+(x2)*(self)->stride[2]]) - -#define THTensor_fastGet4d(self, x0, x1, x2, x3) \ - (((self)->storage->data+(self)->storageOffset)[(x0)*(self)->stride[0]+(x1)*(self)->stride[1]+(x2)*(self)->stride[2]+(x3)*(self)->stride[3]]) - -#define THTensor_fastSet1d(self, x0, value) \ - (((self)->storage->data+(self)->storageOffset)[(x0)*(self)->stride[0]] = value) - -#define THTensor_fastSet2d(self, x0, x1, value) \ - (((self)->storage->data+(self)->storageOffset)[(x0)*(self)->stride[0]+(x1)*(self)->stride[1]] = value) - -#define THTensor_fastSet3d(self, x0, x1, x2, value) \ - (((self)->storage->data+(self)->storageOffset)[(x0)*(self)->stride[0]+(x1)*(self)->stride[1]+(x2)*(self)->stride[2]] 
= value) - -#define THTensor_fastSet4d(self, x0, x1, x2, x3, value) \ - (((self)->storage->data+(self)->storageOffset)[(x0)*(self)->stride[0]+(x1)*(self)->stride[1]+(x2)*(self)->stride[2]+(x3)*(self)->stride[3]] = value) - -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/THVector.c b/contrib/lua-torch/torch7/lib/TH/THVector.c deleted file mode 100644 index 441057884..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THVector.c +++ /dev/null @@ -1,30 +0,0 @@ -#include "THVector.h" - -#include "generic/simd/simd.h" - -#ifdef __NEON__ -#include "vector/NEON.c" -#endif - -#ifdef __PPC64__ -#include "vector/VSX.c" -#endif - -#if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \ - || defined(USE_SSE4_1) || defined(USE_SSE4_2) -#include "vector/SSE.c" -#endif - -#if defined(USE_AVX) -#include "vector/AVX.h" -#endif - -#if defined(USE_AVX2) -#include "vector/AVX2.h" -#endif - -#include "generic/THVectorDefault.c" -#include "THGenerateAllTypes.h" - -#include "generic/THVectorDispatch.c" -#include "THGenerateAllTypes.h" diff --git a/contrib/lua-torch/torch7/lib/TH/THVector.h b/contrib/lua-torch/torch7/lib/TH/THVector.h deleted file mode 100644 index e29917b93..000000000 --- a/contrib/lua-torch/torch7/lib/TH/THVector.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef TH_VECTOR_INC -#define TH_VECTOR_INC - -#include "THGeneral.h" - -#define THVector_(NAME) TH_CONCAT_4(TH,Real,Vector_,NAME) - -/* We are going to use dynamic dispatch, and want only to generate declarations - * of the vector functions */ -#include "generic/THVector.h" -#include "THGenerateAllTypes.h" - -#endif // TH_VECTOR_INC diff --git a/contrib/lua-torch/torch7/lib/TH/cmake/FindARM.cmake b/contrib/lua-torch/torch7/lib/TH/cmake/FindARM.cmake deleted file mode 100644 index 2dcb2a24f..000000000 --- a/contrib/lua-torch/torch7/lib/TH/cmake/FindARM.cmake +++ /dev/null @@ -1,76 +0,0 @@ -# Check if the processor is an ARM and if Neon instruction are available on the machine where -# the project is compiled. 
- -IF(CMAKE_SYSTEM_NAME MATCHES "Linux") - EXEC_PROGRAM(cat ARGS "/proc/cpuinfo" OUTPUT_VARIABLE CPUINFO) - - #neon instruction can be found on the majority part of modern ARM processor - STRING(REGEX REPLACE "^.*(neon).*$" "\\1" NEON_THERE ${CPUINFO}) - STRING(COMPARE EQUAL "neon" "${NEON_THERE}" NEON_TRUE) - IF (NEON_TRUE) - set(NEON_FOUND true CACHE BOOL "NEON available on host") - ELSE (NEON_TRUE) - set(NEON_FOUND false CACHE BOOL "NEON available on host") - ENDIF (NEON_TRUE) - - # on ARMv8, neon is inherit and instead listed as 'asimd' in /proc/cpuinfo - STRING(REGEX REPLACE "^.*(asimd).*$" "\\1" ASIMD_THERE ${CPUINFO}) - STRING(COMPARE EQUAL "asimd" "${ASIMD_THERE}" ASIMD_TRUE) - IF (ASIMD_TRUE) - set(ASIMD_FOUND true CACHE BOOL "ASIMD/NEON available on host") - ELSE (ASIMD_TRUE) - set(ASIMD_FOUND false CACHE BOOL "ASIMD/NEON available on host") - ENDIF (ASIMD_TRUE) - - #Find the processor type (for now OMAP3 or OMAP4) - STRING(REGEX REPLACE "^.*(OMAP3).*$" "\\1" OMAP3_THERE ${CPUINFO}) - STRING(COMPARE EQUAL "OMAP3" "${OMAP3_THERE}" OMAP3_TRUE) - IF (OMAP3_TRUE) - set(CORTEXA8_FOUND true CACHE BOOL "OMAP3 available on host") - ELSE (OMAP3_TRUE) - set(CORTEXA8_FOUND false CACHE BOOL "OMAP3 available on host") - ENDIF (OMAP3_TRUE) - - #Find the processor type (for now OMAP3 or OMAP4) - STRING(REGEX REPLACE "^.*(OMAP4).*$" "\\1" OMAP4_THERE ${CPUINFO}) - STRING(COMPARE EQUAL "OMAP4" "${OMAP4_THERE}" OMAP4_TRUE) - IF (OMAP4_TRUE) - set(CORTEXA9_FOUND true CACHE BOOL "OMAP4 available on host") - ELSE (OMAP4_TRUE) - set(CORTEXA9_FOUND false CACHE BOOL "OMAP4 available on host") - ENDIF (OMAP4_TRUE) - -ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Darwin") - EXEC_PROGRAM("/usr/sbin/sysctl -n machdep.cpu.features" OUTPUT_VARIABLE - CPUINFO) - - #neon instruction can be found on the majority part of modern ARM processor - STRING(REGEX REPLACE "^.*(neon).*$" "\\1" NEON_THERE ${CPUINFO}) - STRING(COMPARE EQUAL "neon" "${NEON_THERE}" NEON_TRUE) - IF (NEON_TRUE) - set(NEON_FOUND 
true CACHE BOOL "NEON available on host") - ELSE (NEON_TRUE) - set(NEON_FOUND false CACHE BOOL "NEON available on host") - ENDIF (NEON_TRUE) - -ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Windows") - # TODO - set(CORTEXA8_FOUND false CACHE BOOL "OMAP3 not available on host") - set(CORTEXA9_FOUND false CACHE BOOL "OMAP4 not available on host") - set(NEON_FOUND false CACHE BOOL "NEON not available on host") -ELSE(CMAKE_SYSTEM_NAME MATCHES "Linux") - set(CORTEXA8_FOUND false CACHE BOOL "OMAP3 not available on host") - set(CORTEXA9_FOUND false CACHE BOOL "OMAP4 not available on host") - set(NEON_FOUND false CACHE BOOL "NEON not available on host") -ENDIF(CMAKE_SYSTEM_NAME MATCHES "Linux") - -if(NOT NEON_FOUND) - MESSAGE(STATUS "Could not find hardware support for NEON on this machine.") -endif(NOT NEON_FOUND) -if(NOT CORTEXA8_FOUND) - MESSAGE(STATUS "No OMAP3 processor on this machine.") -endif(NOT CORTEXA8_FOUND) -if(NOT CORTEXA9_FOUND) - MESSAGE(STATUS "No OMAP4 processor on this machine.") -endif(NOT CORTEXA9_FOUND) -mark_as_advanced(NEON_FOUND) diff --git a/contrib/lua-torch/torch7/lib/TH/cmake/FindBLAS.cmake b/contrib/lua-torch/torch7/lib/TH/cmake/FindBLAS.cmake deleted file mode 100644 index 1f254d231..000000000 --- a/contrib/lua-torch/torch7/lib/TH/cmake/FindBLAS.cmake +++ /dev/null @@ -1,309 +0,0 @@ -# - Find BLAS library -# This module finds an installed fortran library that implements the BLAS -# linear-algebra interface (see http://www.netlib.org/blas/). -# The list of libraries searched for is taken -# from the autoconf macro file, acx_blas.m4 (distributed at -# http://ac-archive.sourceforge.net/ac-archive/acx_blas.html). -# -# This module sets the following variables: -# BLAS_FOUND - set to true if a library implementing the BLAS interface is found. -# BLAS_INFO - name of the detected BLAS library. 
-# BLAS_F2C - set to true if following the f2c return convention -# BLAS_LIBRARIES - list of libraries to link against to use BLAS -# BLAS_INCLUDE_DIR - include directory - -# Do nothing is BLAS was found before -IF(NOT BLAS_FOUND) - -SET(BLAS_LIBRARIES) -SET(BLAS_INCLUDE_DIR) -SET(BLAS_INFO) -SET(BLAS_F2C) - -SET(WITH_BLAS "" CACHE STRING "Blas type [mkl/open/goto/acml/atlas/accelerate/veclib/generic]") - -# Old FindBlas -INCLUDE(CheckCSourceRuns) -INCLUDE(CheckFortranFunctionExists) - -MACRO(Check_Fortran_Libraries LIBRARIES _prefix _name _flags _list) - # This macro checks for the existence of the combination of fortran libraries - # given by _list. If the combination is found, this macro checks (using the - # Check_Fortran_Function_Exists macro) whether can link against that library - # combination using the name of a routine given by _name using the linker - # flags given by _flags. If the combination of libraries is found and passes - # the link test, LIBRARIES is set to the list of complete library paths that - # have been found. Otherwise, LIBRARIES is set to NOTFOUND. - # N.B. _prefix is the prefix applied to the names of all cached variables that - # are generated internally and marked advanced by this macro. 
- - set(__list) - foreach(_elem ${_list}) - if(__list) - set(__list "${__list} - ${_elem}") - else(__list) - set(__list "${_elem}") - endif(__list) - endforeach(_elem) - message(STATUS "Checking for [${__list}]") - - set(_libraries_work TRUE) - set(${LIBRARIES}) - set(_combined_name) - foreach(_library ${_list}) - set(_combined_name ${_combined_name}_${_library}) - if(_libraries_work) - if ( WIN32 ) - find_library(${_prefix}_${_library}_LIBRARY - NAMES ${_library} - PATHS ENV LIB - PATHS ENV PATH ) - endif ( WIN32 ) - if ( APPLE ) - find_library(${_prefix}_${_library}_LIBRARY - NAMES ${_library} - PATHS /usr/local/lib /usr/lib /usr/local/lib64 /usr/lib64 - ENV DYLD_LIBRARY_PATH ) - else ( APPLE ) - find_library(${_prefix}_${_library}_LIBRARY - NAMES ${_library} - PATHS /usr/local/lib /usr/lib /usr/local/lib64 /usr/lib64 - ENV LD_LIBRARY_PATH ) - endif( APPLE ) - mark_as_advanced(${_prefix}_${_library}_LIBRARY) - set(${LIBRARIES} ${${LIBRARIES}} ${${_prefix}_${_library}_LIBRARY}) - set(_libraries_work ${${_prefix}_${_library}_LIBRARY}) - MESSAGE(STATUS " Library ${_library}: ${${_prefix}_${_library}_LIBRARY}") - endif(_libraries_work) - endforeach(_library ${_list}) - if(_libraries_work) - # Test this combination of libraries. - set(CMAKE_REQUIRED_LIBRARIES ${_flags} ${${LIBRARIES}}) - if (CMAKE_Fortran_COMPILER_WORKS) - check_fortran_function_exists(${_name} ${_prefix}${_combined_name}_WORKS) - else (CMAKE_Fortran_COMPILER_WORKS) - check_function_exists("${_name}_" ${_prefix}${_combined_name}_WORKS) - endif (CMAKE_Fortran_COMPILER_WORKS) - set(CMAKE_REQUIRED_LIBRARIES) - mark_as_advanced(${_prefix}${_combined_name}_WORKS) - set(_libraries_work ${${_prefix}${_combined_name}_WORKS}) - endif(_libraries_work) - if(NOT _libraries_work) - set(${LIBRARIES} NOTFOUND) - endif(NOT _libraries_work) -endmacro(Check_Fortran_Libraries) - -# Intel MKL? 
-if((NOT BLAS_LIBRARIES) - AND ((NOT WITH_BLAS) OR (WITH_BLAS STREQUAL "mkl"))) - FIND_PACKAGE(MKL) - IF(MKL_FOUND) - SET(BLAS_INFO "mkl") - SET(BLAS_LIBRARIES ${MKL_LIBRARIES}) - SET(BLAS_INCLUDE_DIR ${MKL_INCLUDE_DIR}) - SET(BLAS_VERSION ${MKL_VERSION}) - ENDIF(MKL_FOUND) -endif() - -if((NOT BLAS_LIBRARIES) - AND ((NOT WITH_BLAS) OR (WITH_BLAS STREQUAL "open"))) - check_fortran_libraries( - BLAS_LIBRARIES - BLAS - sgemm - "" - "openblas") - if(BLAS_LIBRARIES) - set(BLAS_INFO "open") - endif(BLAS_LIBRARIES) -endif() - -if((NOT BLAS_LIBRARIES) - AND ((NOT WITH_BLAS) OR (WITH_BLAS STREQUAL "open"))) - check_fortran_libraries( - BLAS_LIBRARIES - BLAS - sgemm - "" - "openblas;pthread") - if(BLAS_LIBRARIES) - set(BLAS_INFO "open") - endif(BLAS_LIBRARIES) -endif() - -if((NOT BLAS_LIBRARIES) AND (WIN32) - AND ((NOT WITH_BLAS) OR (WITH_BLAS STREQUAL "open"))) - check_fortran_libraries( - BLAS_LIBRARIES - BLAS - sgemm - "" - "libopenblas") - if(BLAS_LIBRARIES) - set(BLAS_INFO "open") - endif(BLAS_LIBRARIES) -endif() - -if((NOT BLAS_LIBRARIES) - AND ((NOT WITH_BLAS) OR (WITH_BLAS STREQUAL "goto"))) - check_fortran_libraries( - BLAS_LIBRARIES - BLAS - sgemm - "" - "goto2;gfortran") - if (BLAS_LIBRARIES) - set(BLAS_INFO "goto") - endif (BLAS_LIBRARIES) -endif() - -if((NOT BLAS_LIBRARIES) - AND ((NOT WITH_BLAS) OR (WITH_BLAS STREQUAL "goto"))) - check_fortran_libraries( - BLAS_LIBRARIES - BLAS - sgemm - "" - "goto2;gfortran;pthread") - if (BLAS_LIBRARIES) - set(BLAS_INFO "goto") - endif (BLAS_LIBRARIES) -endif() - -if((NOT BLAS_LIBRARIES) - AND ((NOT WITH_BLAS) OR (WITH_BLAS STREQUAL "acml"))) - check_fortran_libraries( - BLAS_LIBRARIES - BLAS - sgemm - "" - "acml;gfortran") - if (BLAS_LIBRARIES) - set(BLAS_INFO "acml") - endif (BLAS_LIBRARIES) -endif() - -# Apple BLAS library? 
-if((NOT BLAS_LIBRARIES) - AND ((NOT WITH_BLAS) OR (WITH_BLAS STREQUAL "accelerate"))) - check_fortran_libraries( - BLAS_LIBRARIES - BLAS - sgemm - "" - "Accelerate") - if (BLAS_LIBRARIES) - set(BLAS_INFO "accelerate") - set(BLAS_IS_ACCELERATE 1) - endif (BLAS_LIBRARIES) -endif() - -if((NOT BLAS_LIBRARIES) - AND ((NOT WITH_BLAS) OR (WITH_BLAS STREQUAL "veclib"))) - check_fortran_libraries( - BLAS_LIBRARIES - BLAS - sgemm - "" - "vecLib") - if (BLAS_LIBRARIES) - set(BLAS_INFO "veclib") - endif (BLAS_LIBRARIES) -endif() - -# BLAS in ATLAS library? (http://math-atlas.sourceforge.net/) -if((NOT BLAS_LIBRARIES) - AND ((NOT WITH_BLAS) OR (WITH_BLAS STREQUAL "atlas"))) - check_fortran_libraries( - BLAS_LIBRARIES - BLAS - sgemm - "" - "ptf77blas;atlas;gfortran") - if (BLAS_LIBRARIES) - set(BLAS_INFO "atlas") - endif (BLAS_LIBRARIES) -endif() - -# Generic BLAS library? -if((NOT BLAS_LIBRARIES) - AND ((NOT WITH_BLAS) OR (WITH_BLAS STREQUAL "generic"))) - check_fortran_libraries( - BLAS_LIBRARIES - BLAS - sgemm - "" - "blas") - if (BLAS_LIBRARIES) - check_fortran_libraries( - TMP_BLAS_LIBRARIES - TMP_BLAS - openblas_get_num_threads - "" - "blas") - if (TMP_BLAS_LIBRARIES) - set(BLAS_INFO "open") - else() - set(BLAS_INFO "generic") - endif() - endif (BLAS_LIBRARIES) -endif() - -# Determine if blas was compiled with the f2c conventions -IF (BLAS_LIBRARIES) - SET(CMAKE_REQUIRED_LIBRARIES ${BLAS_LIBRARIES}) - CHECK_C_SOURCE_RUNS(" -#include <stdlib.h> -#include <stdio.h> -float x[4] = { 1, 2, 3, 4 }; -float y[4] = { .1, .01, .001, .0001 }; -int four = 4; -int one = 1; -extern double sdot_(); -int main() { - int i; - double r = sdot_(&four, x, &one, y, &one); - exit((float)r != (float).1234); -}" BLAS_F2C_DOUBLE_WORKS ) - CHECK_C_SOURCE_RUNS(" -#include <stdlib.h> -#include <stdio.h> -float x[4] = { 1, 2, 3, 4 }; -float y[4] = { .1, .01, .001, .0001 }; -int four = 4; -int one = 1; -extern float sdot_(); -int main() { - int i; - double r = sdot_(&four, x, &one, y, &one); - 
exit((float)r != (float).1234); -}" BLAS_F2C_FLOAT_WORKS ) - IF (BLAS_F2C_DOUBLE_WORKS AND NOT BLAS_F2C_FLOAT_WORKS) - MESSAGE(STATUS "This BLAS uses the F2C return conventions") - SET(BLAS_F2C TRUE) - ELSE (BLAS_F2C_DOUBLE_WORKS AND NOT BLAS_F2C_FLOAT_WORKS) - SET(BLAS_F2C FALSE) - ENDIF (BLAS_F2C_DOUBLE_WORKS AND NOT BLAS_F2C_FLOAT_WORKS) -ENDIF(BLAS_LIBRARIES) - -# epilogue - -if(BLAS_LIBRARIES) - set(BLAS_FOUND TRUE) -else(BLAS_LIBRARIES) - set(BLAS_FOUND FALSE) -endif(BLAS_LIBRARIES) - -IF (NOT BLAS_FOUND AND BLAS_FIND_REQUIRED) - message(FATAL_ERROR "Cannot find a library with BLAS API. Please specify library location.") -ENDIF (NOT BLAS_FOUND AND BLAS_FIND_REQUIRED) -IF(NOT BLAS_FIND_QUIETLY) - IF(BLAS_FOUND) - MESSAGE(STATUS "Found a library with BLAS API (${BLAS_INFO}).") - ELSE(BLAS_FOUND) - MESSAGE(STATUS "Cannot find a library with BLAS API. Not using BLAS.") - ENDIF(BLAS_FOUND) -ENDIF(NOT BLAS_FIND_QUIETLY) - -# Do nothing is BLAS was found before -ENDIF(NOT BLAS_FOUND) diff --git a/contrib/lua-torch/torch7/lib/TH/cmake/FindLAPACK.cmake b/contrib/lua-torch/torch7/lib/TH/cmake/FindLAPACK.cmake deleted file mode 100644 index 9eca0730f..000000000 --- a/contrib/lua-torch/torch7/lib/TH/cmake/FindLAPACK.cmake +++ /dev/null @@ -1,190 +0,0 @@ -# - Find LAPACK library -# This module finds an installed fortran library that implements the LAPACK -# linear-algebra interface (see http://www.netlib.org/lapack/). -# -# The approach follows that taken for the autoconf macro file, acx_lapack.m4 -# (distributed at http://ac-archive.sourceforge.net/ac-archive/acx_lapack.html). 
-# -# This module sets the following variables: -# LAPACK_FOUND - set to true if a library implementing the LAPACK interface is found -# LAPACK_LIBRARIES - list of libraries (using full path name) for LAPACK - -# Note: I do not think it is a good idea to mixup different BLAS/LAPACK versions -# Hence, this script wants to find a Lapack library matching your Blas library - -# Do nothing if LAPACK was found before -IF(NOT LAPACK_FOUND) - -SET(LAPACK_LIBRARIES) -SET(LAPACK_INFO) - -IF(LAPACK_FIND_QUIETLY OR NOT LAPACK_FIND_REQUIRED) - FIND_PACKAGE(BLAS) -ELSE(LAPACK_FIND_QUIETLY OR NOT LAPACK_FIND_REQUIRED) - FIND_PACKAGE(BLAS REQUIRED) -ENDIF(LAPACK_FIND_QUIETLY OR NOT LAPACK_FIND_REQUIRED) - -# Old search lapack script -include(CheckFortranFunctionExists) - -macro(Check_Lapack_Libraries LIBRARIES _prefix _name _flags _list _blas) - # This macro checks for the existence of the combination of fortran libraries - # given by _list. If the combination is found, this macro checks (using the - # Check_Fortran_Function_Exists macro) whether can link against that library - # combination using the name of a routine given by _name using the linker - # flags given by _flags. If the combination of libraries is found and passes - # the link test, LIBRARIES is set to the list of complete library paths that - # have been found. Otherwise, LIBRARIES is set to FALSE. - # N.B. _prefix is the prefix applied to the names of all cached variables that - # are generated internally and marked advanced by this macro. 
- set(_libraries_work TRUE) - set(${LIBRARIES}) - set(_combined_name) - foreach(_library ${_list}) - set(_combined_name ${_combined_name}_${_library}) - if(_libraries_work) - if (WIN32) - find_library(${_prefix}_${_library}_LIBRARY - NAMES ${_library} PATHS ENV LIB PATHS ENV PATH) - else (WIN32) - if(APPLE) - find_library(${_prefix}_${_library}_LIBRARY - NAMES ${_library} - PATHS /usr/local/lib /usr/lib /usr/local/lib64 /usr/lib64 - ENV DYLD_LIBRARY_PATH) - else(APPLE) - find_library(${_prefix}_${_library}_LIBRARY - NAMES ${_library} - PATHS /usr/local/lib /usr/lib /usr/local/lib64 /usr/lib64 - ENV LD_LIBRARY_PATH) - endif(APPLE) - endif(WIN32) - mark_as_advanced(${_prefix}_${_library}_LIBRARY) - set(${LIBRARIES} ${${LIBRARIES}} ${${_prefix}_${_library}_LIBRARY}) - set(_libraries_work ${${_prefix}_${_library}_LIBRARY}) - endif(_libraries_work) - endforeach(_library ${_list}) - if(_libraries_work) - # Test this combination of libraries. - set(CMAKE_REQUIRED_LIBRARIES ${_flags} ${${LIBRARIES}} ${_blas}) - if (CMAKE_Fortran_COMPILER_WORKS) - check_fortran_function_exists(${_name} ${_prefix}${_combined_name}_WORKS) - else (CMAKE_Fortran_COMPILER_WORKS) - check_function_exists("${_name}_" ${_prefix}${_combined_name}_WORKS) - endif (CMAKE_Fortran_COMPILER_WORKS) - set(CMAKE_REQUIRED_LIBRARIES) - mark_as_advanced(${_prefix}${_combined_name}_WORKS) - set(_libraries_work ${${_prefix}${_combined_name}_WORKS}) - endif(_libraries_work) - if(NOT _libraries_work) - set(${LIBRARIES} FALSE) - endif(NOT _libraries_work) -endmacro(Check_Lapack_Libraries) - - -if(BLAS_FOUND) - - # Intel MKL - IF((NOT LAPACK_INFO) AND (BLAS_INFO STREQUAL "mkl")) - IF(MKL_LAPACK_LIBRARIES) - SET(LAPACK_LIBRARIES ${MKL_LAPACK_LIBRARIES} ${MKL_LIBRARIES}) - ELSE(MKL_LAPACK_LIBRARIES) - SET(LAPACK_LIBRARIES ${MKL_LIBRARIES}) - ENDIF(MKL_LAPACK_LIBRARIES) - SET(LAPACK_INCLUDE_DIR ${MKL_INCLUDE_DIR}) - SET(LAPACK_INFO "mkl") - ENDIF() - - # OpenBlas - IF((NOT LAPACK_INFO) AND (BLAS_INFO STREQUAL "open")) - 
SET(CMAKE_REQUIRED_LIBRARIES ${BLAS_LIBRARIES}) - check_function_exists("cheev_" OPEN_LAPACK_WORKS) - if(OPEN_LAPACK_WORKS) - SET(LAPACK_INFO "open") - else() - message(STATUS "It seems OpenBlas has not been compiled with Lapack support") - endif() - endif() - - # GotoBlas - IF((NOT LAPACK_INFO) AND (BLAS_INFO STREQUAL "goto")) - SET(CMAKE_REQUIRED_LIBRARIES ${BLAS_LIBRARIES}) - check_function_exists("cheev_" GOTO_LAPACK_WORKS) - if(GOTO_LAPACK_WORKS) - SET(LAPACK_INFO "goto") - else() - message(STATUS "It seems GotoBlas has not been compiled with Lapack support") - endif() - endif() - - # ACML - IF((NOT LAPACK_INFO) AND (BLAS_INFO STREQUAL "acml")) - SET(CMAKE_REQUIRED_LIBRARIES ${BLAS_LIBRARIES}) - check_function_exists("cheev_" ACML_LAPACK_WORKS) - if(ACML_LAPACK_WORKS) - SET(LAPACK_INFO "acml") - else() - message(STATUS "Strangely, this ACML library does not support Lapack?!") - endif() - endif() - - # Accelerate - IF((NOT LAPACK_INFO) AND (BLAS_INFO STREQUAL "accelerate")) - SET(CMAKE_REQUIRED_LIBRARIES ${BLAS_LIBRARIES}) - check_function_exists("cheev_" ACCELERATE_LAPACK_WORKS) - if(ACCELERATE_LAPACK_WORKS) - SET(LAPACK_INFO "accelerate") - else() - message(STATUS "Strangely, this Accelerate library does not support Lapack?!") - endif() - endif() - - # vecLib - IF((NOT LAPACK_INFO) AND (BLAS_INFO STREQUAL "veclib")) - SET(CMAKE_REQUIRED_LIBRARIES ${BLAS_LIBRARIES}) - check_function_exists("cheev_" VECLIB_LAPACK_WORKS) - if(VECLIB_LAPACK_WORKS) - SET(LAPACK_INFO "veclib") - else() - message(STATUS "Strangely, this vecLib library does not support Lapack?!") - endif() - endif() - - # Generic LAPACK library? 
- IF((NOT LAPACK_INFO) AND ((BLAS_INFO STREQUAL "generic") OR (BLAS_INFO STREQUAL "open"))) - check_lapack_libraries( - LAPACK_LIBRARIES - LAPACK - cheev - "" - "lapack" - "${BLAS_LIBRARIES}" - ) - if(LAPACK_LIBRARIES) - SET(LAPACK_INFO "generic") - endif(LAPACK_LIBRARIES) - endif() - -else(BLAS_FOUND) - message(STATUS "LAPACK requires BLAS") -endif(BLAS_FOUND) - -if(LAPACK_INFO) - set(LAPACK_FOUND TRUE) -else(LAPACK_INFO) - set(LAPACK_FOUND FALSE) -endif(LAPACK_INFO) - -IF (NOT LAPACK_FOUND AND LAPACK_FIND_REQUIRED) - message(FATAL_ERROR "Cannot find a library with LAPACK API. Please specify library location.") -ENDIF (NOT LAPACK_FOUND AND LAPACK_FIND_REQUIRED) -IF(NOT LAPACK_FIND_QUIETLY) - IF(LAPACK_FOUND) - MESSAGE(STATUS "Found a library with LAPACK API. (${LAPACK_INFO})") - ELSE(LAPACK_FOUND) - MESSAGE(STATUS "Cannot find a library with LAPACK API. Not using LAPACK.") - ENDIF(LAPACK_FOUND) -ENDIF(NOT LAPACK_FIND_QUIETLY) - -# Do nothing if LAPACK was found before -ENDIF(NOT LAPACK_FOUND) diff --git a/contrib/lua-torch/torch7/lib/TH/cmake/FindMKL.cmake b/contrib/lua-torch/torch7/lib/TH/cmake/FindMKL.cmake deleted file mode 100644 index 08b450985..000000000 --- a/contrib/lua-torch/torch7/lib/TH/cmake/FindMKL.cmake +++ /dev/null @@ -1,272 +0,0 @@ -# - Find INTEL MKL library -# -# This module finds the Intel Mkl libraries. -# -# This module sets the following variables: -# MKL_FOUND - set to true if a library implementing the CBLAS interface is found -# MKL_VERSION - best guess -# MKL_INCLUDE_DIR - path to include dir. -# MKL_LIBRARIES - list of libraries for base mkl -# MKL_LAPACK_LIBRARIES - list of libraries to add for lapack -# MKL_SCALAPACK_LIBRARIES - list of libraries to add for scalapack -# MKL_SOLVER_LIBRARIES - list of libraries to add for the solvers -# MKL_CDFT_LIBRARIES - list of libraries to add for the solvers - - -# Do nothing if MKL_FOUND was set before! 
-IF (NOT MKL_FOUND) - -SET(MKL_VERSION) -SET(MKL_INCLUDE_DIR) -SET(MKL_LIBRARIES) -SET(MKL_LAPACK_LIBRARIES) -SET(MKL_SCALAPACK_LIBRARIES) -SET(MKL_SOLVER_LIBRARIES) -SET(MKL_CDFT_LIBRARIES) - -# Includes -INCLUDE(CheckTypeSize) -INCLUDE(CheckFunctionExists) - -# Intel Compiler Suite -SET(INTEL_COMPILER_DIR CACHE STRING - "Root directory of the Intel Compiler Suite (contains ipp, mkl, etc.)") -SET(INTEL_MKL_DIR CACHE STRING - "Root directory of the Intel MKL (standalone)") -SET(INTEL_MKL_SEQUENTIAL OFF CACHE BOOL - "Force using the sequential (non threaded) libraries") - -# Checks -CHECK_TYPE_SIZE("void*" SIZE_OF_VOIDP) -IF ("${SIZE_OF_VOIDP}" EQUAL 8) - SET(mklvers "em64t") - SET(iccvers "intel64") - SET(mkl64s "_lp64") -ELSE ("${SIZE_OF_VOIDP}" EQUAL 8) - SET(mklvers "32") - SET(iccvers "ia32") - SET(mkl64s) -ENDIF ("${SIZE_OF_VOIDP}" EQUAL 8) -IF(CMAKE_COMPILER_IS_GNUCC) - SET(mklthreads "mkl_gnu_thread" "mkl_intel_thread") - SET(mklifaces "gf" "intel") - SET(mklrtls "iomp5") -ELSE(CMAKE_COMPILER_IS_GNUCC) - SET(mklthreads "mkl_intel_thread") - SET(mklifaces "intel") - SET(mklrtls "iomp5" "guide") - IF (MSVC) - SET(mklrtls "libiomp5md") - ENDIF (MSVC) -ENDIF (CMAKE_COMPILER_IS_GNUCC) - -# Kernel libraries dynamically loaded -SET(mklkerlibs "mc" "mc3" "nc" "p4n" "p4m" "p4m3" "p4p" "def") -SET(mklseq) - - - -# Paths -SET(saved_CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH}) -SET(saved_CMAKE_INCLUDE_PATH ${CMAKE_INCLUDE_PATH}) -IF (INTEL_COMPILER_DIR) - # TODO: diagnostic if dir does not exist - SET(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} - "${INTEL_COMPILER_DIR}/lib/${iccvers}") - IF (NOT INTEL_MKL_DIR) - SET(INTEL_MKL_DIR "${INTEL_COMPILER_DIR}/mkl") - ENDIF (NOT INTEL_MKL_DIR) -ENDIF (INTEL_COMPILER_DIR) -IF (INTEL_MKL_DIR) - # TODO: diagnostic if dir does not exist - SET(CMAKE_INCLUDE_PATH ${CMAKE_INCLUDE_PATH} - "${INTEL_MKL_DIR}/include") - SET(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} - "${INTEL_MKL_DIR}/lib/${mklvers}") - IF (MSVC) - SET(CMAKE_LIBRARY_PATH 
${CMAKE_LIBRARY_PATH} - "${INTEL_MKL_DIR}/lib/${iccvers}") - ENDIF (MSVC) -ENDIF (INTEL_MKL_DIR) - -# Try linking multiple libs -MACRO(CHECK_ALL_LIBRARIES LIBRARIES _name _list _flags) - # This macro checks for the existence of the combination of libraries given by _list. - # If the combination is found, this macro whether we can link against that library - # combination using the name of a routine given by _name using the linker - # flags given by _flags. If the combination of libraries is found and passes - # the link test, LIBRARIES is set to the list of complete library paths that - # have been found. Otherwise, LIBRARIES is set to FALSE. - # N.B. _prefix is the prefix applied to the names of all cached variables that - # are generated internally and marked advanced by this macro. - SET(_prefix "${LIBRARIES}") - # start checking - SET(_libraries_work TRUE) - SET(${LIBRARIES}) - SET(_combined_name) - SET(_paths) - set(__list) - foreach(_elem ${_list}) - if(__list) - set(__list "${__list} - ${_elem}") - else(__list) - set(__list "${_elem}") - endif(__list) - endforeach(_elem) - message(STATUS "Checking for [${__list}]") - FOREACH(_library ${_list}) - SET(_combined_name ${_combined_name}_${_library}) - IF(_libraries_work) - FIND_LIBRARY(${_prefix}_${_library}_LIBRARY NAMES ${_library}) - MARK_AS_ADVANCED(${_prefix}_${_library}_LIBRARY) - SET(${LIBRARIES} ${${LIBRARIES}} ${${_prefix}_${_library}_LIBRARY}) - SET(_libraries_work ${${_prefix}_${_library}_LIBRARY}) - IF(${_prefix}_${_library}_LIBRARY) - MESSAGE(STATUS " Library ${_library}: ${${_prefix}_${_library}_LIBRARY}") - ELSE(${_prefix}_${_library}_LIBRARY) - MESSAGE(STATUS " Library ${_library}: not found") - ENDIF(${_prefix}_${_library}_LIBRARY) - ENDIF(_libraries_work) - ENDFOREACH(_library ${_list}) - # Test this combination of libraries. 
- IF(_libraries_work) - SET(CMAKE_REQUIRED_LIBRARIES ${_flags} ${${LIBRARIES}}) - CHECK_FUNCTION_EXISTS(${_name} ${_prefix}${_combined_name}_WORKS) - SET(CMAKE_REQUIRED_LIBRARIES) - MARK_AS_ADVANCED(${_prefix}${_combined_name}_WORKS) - SET(_libraries_work ${${_prefix}${_combined_name}_WORKS}) - ENDIF(_libraries_work) - # Fin - IF(_libraries_work) - ELSE (_libraries_work) - SET(${LIBRARIES}) - MARK_AS_ADVANCED(${LIBRARIES}) - ENDIF(_libraries_work) -ENDMACRO(CHECK_ALL_LIBRARIES) - -if(WIN32) - set(mkl_m "") -else(WIN32) - set(mkl_m "m") -endif(WIN32) - - -# Check for version 10/11 -IF (NOT MKL_LIBRARIES) - SET(MKL_VERSION 1011) -ENDIF (NOT MKL_LIBRARIES) -FOREACH(mklrtl ${mklrtls} "") - FOREACH(mkliface ${mklifaces}) - FOREACH(mkl64 ${mkl64s} "") - FOREACH(mklthread ${mklthreads}) - IF (NOT MKL_LIBRARIES AND NOT INTEL_MKL_SEQUENTIAL) - CHECK_ALL_LIBRARIES(MKL_LIBRARIES cblas_sgemm - "mkl_${mkliface}${mkl64};${mklthread};mkl_core;${mklrtl};pthread;${mkl_m}" "") - ENDIF (NOT MKL_LIBRARIES AND NOT INTEL_MKL_SEQUENTIAL) - ENDFOREACH(mklthread) - ENDFOREACH(mkl64) - ENDFOREACH(mkliface) -ENDFOREACH(mklrtl) -FOREACH(mklrtl ${mklrtls} "") - FOREACH(mkliface ${mklifaces}) - FOREACH(mkl64 ${mkl64s} "") - IF (NOT MKL_LIBRARIES) - CHECK_ALL_LIBRARIES(MKL_LIBRARIES cblas_sgemm - "mkl_${mkliface}${mkl64};mkl_sequential;mkl_core;${mkl_m}" "") - IF (MKL_LIBRARIES) - SET(mklseq "_sequential") - ENDIF (MKL_LIBRARIES) - ENDIF (NOT MKL_LIBRARIES) - ENDFOREACH(mkl64) - ENDFOREACH(mkliface) -ENDFOREACH(mklrtl) -FOREACH(mklrtl ${mklrtls} "") - FOREACH(mkliface ${mklifaces}) - FOREACH(mkl64 ${mkl64s} "") - FOREACH(mklthread ${mklthreads}) - IF (NOT MKL_LIBRARIES) - CHECK_ALL_LIBRARIES(MKL_LIBRARIES cblas_sgemm - "mkl_${mkliface}${mkl64};${mklthread};mkl_core;${mklrtl};pthread;${mkl_m}" "") - ENDIF (NOT MKL_LIBRARIES) - ENDFOREACH(mklthread) - ENDFOREACH(mkl64) - ENDFOREACH(mkliface) -ENDFOREACH(mklrtl) - -# Check for older versions -IF (NOT MKL_LIBRARIES) - SET(MKL_VERSION 900) - 
CHECK_ALL_LIBRARIES(MKL_LIBRARIES cblas_sgemm - "mkl;guide;pthread;m" "") -ENDIF (NOT MKL_LIBRARIES) - -# Include files -IF (MKL_LIBRARIES) - FIND_PATH(MKL_INCLUDE_DIR "mkl_cblas.h") - MARK_AS_ADVANCED(MKL_INCLUDE_DIR) -ENDIF (MKL_LIBRARIES) - -# Other libraries -IF (MKL_LIBRARIES) - FOREACH(mkl64 ${mkl64s} "_core" "") - FOREACH(mkls ${mklseq} "") - IF (NOT MKL_LAPACK_LIBRARIES) - FIND_LIBRARY(MKL_LAPACK_LIBRARIES NAMES "mkl_lapack${mkl64}${mkls}") - MARK_AS_ADVANCED(MKL_LAPACK_LIBRARIES) - ENDIF (NOT MKL_LAPACK_LIBRARIES) - IF (NOT MKL_SCALAPACK_LIBRARIES) - FIND_LIBRARY(MKL_SCALAPACK_LIBRARIES NAMES "mkl_scalapack${mkl64}${mkls}") - MARK_AS_ADVANCED(MKL_SCALAPACK_LIBRARIES) - ENDIF (NOT MKL_SCALAPACK_LIBRARIES) - IF (NOT MKL_SOLVER_LIBRARIES) - FIND_LIBRARY(MKL_SOLVER_LIBRARIES NAMES "mkl_solver${mkl64}${mkls}") - MARK_AS_ADVANCED(MKL_SOLVER_LIBRARIES) - ENDIF (NOT MKL_SOLVER_LIBRARIES) - IF (NOT MKL_CDFT_LIBRARIES) - FIND_LIBRARY(MKL_CDFT_LIBRARIES NAMES "mkl_cdft${mkl64}${mkls}") - MARK_AS_ADVANCED(MKL_CDFT_LIBRARIES) - ENDIF (NOT MKL_CDFT_LIBRARIES) - ENDFOREACH(mkls) - ENDFOREACH(mkl64) -ENDIF (MKL_LIBRARIES) - -# LibIRC: intel compiler always links this; -# gcc does not; but mkl kernels sometimes need it. 
-IF (MKL_LIBRARIES) - IF (CMAKE_COMPILER_IS_GNUCC) - FIND_LIBRARY(MKL_KERNEL_libirc "irc") - ELSEIF (CMAKE_C_COMPILER_ID AND NOT CMAKE_C_COMPILER_ID STREQUAL "Intel") - FIND_LIBRARY(MKL_KERNEL_libirc "irc") - ENDIF (CMAKE_COMPILER_IS_GNUCC) - MARK_AS_ADVANCED(MKL_KERNEL_libirc) - IF (MKL_KERNEL_libirc) - SET(MKL_LIBRARIES ${MKL_LIBRARIES} ${MKL_KERNEL_libirc}) - ENDIF (MKL_KERNEL_libirc) -ENDIF (MKL_LIBRARIES) - -# Final -SET(CMAKE_LIBRARY_PATH ${saved_CMAKE_LIBRARY_PATH}) -SET(CMAKE_INCLUDE_PATH ${saved_CMAKE_INCLUDE_PATH}) -IF (MKL_LIBRARIES) - SET(MKL_FOUND TRUE) -ELSE (MKL_LIBRARIES) - SET(MKL_FOUND FALSE) - SET(MKL_VERSION) -ENDIF (MKL_LIBRARIES) - -# Standard termination -IF(NOT MKL_FOUND AND MKL_FIND_REQUIRED) - MESSAGE(FATAL_ERROR "MKL library not found. Please specify library location") -ENDIF(NOT MKL_FOUND AND MKL_FIND_REQUIRED) -IF(NOT MKL_FIND_QUIETLY) - IF(MKL_FOUND) - MESSAGE(STATUS "MKL library found") - ELSE(MKL_FOUND) - MESSAGE(STATUS "MKL library not found") - ENDIF(MKL_FOUND) -ENDIF(NOT MKL_FIND_QUIETLY) - -# Do nothing if MKL_FOUND was set before! 
-ENDIF (NOT MKL_FOUND) - - diff --git a/contrib/lua-torch/torch7/lib/TH/cmake/FindSSE.cmake b/contrib/lua-torch/torch7/lib/TH/cmake/FindSSE.cmake deleted file mode 100644 index a14abe8d4..000000000 --- a/contrib/lua-torch/torch7/lib/TH/cmake/FindSSE.cmake +++ /dev/null @@ -1,125 +0,0 @@ -INCLUDE(CheckCSourceRuns) -INCLUDE(CheckCXXSourceRuns) - -SET(SSE1_CODE " - #include <xmmintrin.h> - - int main() - { - __m128 a; - float vals[4] = {0,0,0,0}; - a = _mm_loadu_ps(vals); - return 0; - }") - -SET(SSE2_CODE " - #include <emmintrin.h> - - int main() - { - __m128d a; - double vals[2] = {0,0}; - a = _mm_loadu_pd(vals); - return 0; - }") - -SET(SSE3_CODE " - #include <pmmintrin.h> - - int main( ) - { - const int vals[4] = {0,0,0,0}; - __m128i a; - a = _mm_lddqu_si128( (const __m128i*)vals ); - return 0; - }") - -SET(SSE4_1_CODE " - #include <smmintrin.h> - - int main () - { - __m128i a = {0,0,0,0}, b = {0,0,0,0}; - __m128i res = _mm_max_epi8(a, b); - - return 0; - } -") - -SET(SSE4_2_CODE " - #include <nmmintrin.h> - - int main() - { - __m128i a = {0,0,0,0}, b = {0,0,0,0}, c = {0,0,0,0}; - c = _mm_cmpgt_epi64(a, b); - return 0; - } -") - -SET(AVX_CODE " - #include <immintrin.h> - - int main() - { - __m256 a; - a = _mm256_set1_ps(0); - return 0; - } -") - -SET(AVX2_CODE " - #include <immintrin.h> - - int main() - { - __m256i a = {0}; - a = _mm256_abs_epi16(a); - return 0; - } -") - -MACRO(CHECK_SSE lang type flags) - SET(__FLAG_I 1) - SET(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS}) - FOREACH(__FLAG ${flags}) - IF(NOT ${lang}_${type}_FOUND) - SET(CMAKE_REQUIRED_FLAGS ${__FLAG}) - IF(lang STREQUAL "CXX") - CHECK_CXX_SOURCE_RUNS("${${type}_CODE}" ${lang}_HAS_${type}_${__FLAG_I}) - ELSE() - CHECK_C_SOURCE_RUNS("${${type}_CODE}" ${lang}_HAS_${type}_${__FLAG_I}) - ENDIF() - IF(${lang}_HAS_${type}_${__FLAG_I}) - SET(${lang}_${type}_FOUND TRUE CACHE BOOL "${lang} ${type} support") - SET(${lang}_${type}_FLAGS "${__FLAG}" CACHE STRING "${lang} ${type} flags") - ENDIF() - 
MATH(EXPR __FLAG_I "${__FLAG_I}+1") - ENDIF() - ENDFOREACH() - SET(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE}) - - IF(NOT ${lang}_${type}_FOUND) - SET(${lang}_${type}_FOUND FALSE CACHE BOOL "${lang} ${type} support") - SET(${lang}_${type}_FLAGS "" CACHE STRING "${lang} ${type} flags") - ENDIF() - - MARK_AS_ADVANCED(${lang}_${type}_FOUND ${lang}_${type}_FLAGS) - -ENDMACRO() - -CHECK_SSE(C "SSE1" " ;-msse;/arch:SSE") -CHECK_SSE(C "SSE2" " ;-msse2;/arch:SSE2") -CHECK_SSE(C "SSE3" " ;-msse3;/arch:SSE3") -CHECK_SSE(C "SSE4_1" " ;-msse4.1;-msse4;/arch:SSE4") -CHECK_SSE(C "SSE4_2" " ;-msse4.2;-msse4;/arch:SSE4") -CHECK_SSE(C "AVX" " ;-mavx;/arch:AVX") -CHECK_SSE(C "AVX2" " ;-mavx2 -mfma;/arch:AVX2") - -CHECK_SSE(CXX "SSE1" " ;-msse;/arch:SSE") -CHECK_SSE(CXX "SSE2" " ;-msse2;/arch:SSE2") -CHECK_SSE(CXX "SSE3" " ;-msse3;/arch:SSE3") -CHECK_SSE(CXX "SSE4_1" " ;-msse4.1;-msse4;/arch:SSE4") -CHECK_SSE(CXX "SSE4_2" " ;-msse4.2;-msse4;/arch:SSE4") -CHECK_SSE(CXX "AVX" " ;-mavx;/arch:AVX") -CHECK_SSE(CXX "AVX2" " ;-mavx2 -mfma;/arch:AVX2") diff --git a/contrib/lua-torch/torch7/lib/TH/generic/THBlas.c b/contrib/lua-torch/torch7/lib/TH/generic/THBlas.c deleted file mode 100644 index b04931f34..000000000 --- a/contrib/lua-torch/torch7/lib/TH/generic/THBlas.c +++ /dev/null @@ -1,412 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "generic/THBlas.c" -#else - - -#ifdef BLAS_F2C -# define ffloat double -#else -# define ffloat float -#endif - -TH_EXTERNC void dswap_(int *n, double *x, int *incx, double *y, int *incy); -TH_EXTERNC void sswap_(int *n, float *x, int *incx, float *y, int *incy); -TH_EXTERNC void dscal_(int *n, double *a, double *x, int *incx); -TH_EXTERNC void sscal_(int *n, float *a, float *x, int *incx); -TH_EXTERNC void dcopy_(int *n, double *x, int *incx, double *y, int *incy); -TH_EXTERNC void scopy_(int *n, float *x, int *incx, float *y, int *incy); -TH_EXTERNC void daxpy_(int *n, double *a, double *x, int *incx, double *y, int *incy); -TH_EXTERNC 
void saxpy_(int *n, float *a, float *x, int *incx, float *y, int *incy); -TH_EXTERNC double ddot_(int *n, double *x, int *incx, double *y, int *incy); -TH_EXTERNC ffloat sdot_(int *n, float *x, int *incx, float *y, int *incy); -TH_EXTERNC void dgemv_(char *trans, int *m, int *n, double *alpha, double *a, int *lda, double *x, int *incx, double *beta, double *y, int *incy); -TH_EXTERNC void sgemv_(char *trans, int *m, int *n, float *alpha, float *a, int *lda, float *x, int *incx, float *beta, float *y, int *incy); -TH_EXTERNC void dger_(int *m, int *n, double *alpha, double *x, int *incx, double *y, int *incy, double *a, int *lda); -TH_EXTERNC void sger_(int *m, int *n, float *alpha, float *x, int *incx, float *y, int *incy, float *a, int *lda); -TH_EXTERNC void dgemm_(char *transa, char *transb, int *m, int *n, int *k, double *alpha, double *a, int *lda, double *b, int *ldb, double *beta, double *c, int *ldc); -TH_EXTERNC void sgemm_(char *transa, char *transb, int *m, int *n, int *k, float *alpha, float *a, int *lda, float *b, int *ldb, float *beta, float *c, int *ldc); - - - -void THBlas_(swap)(long n, real *x, long incx, real *y, long incy) -{ - if(n == 1) - { - incx = 1; - incy = 1; - } - -#if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)) - if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) - { - int i_n = (int)n; - int i_incx = (int)incx; - int i_incy = (int)incy; - -#if defined(TH_REAL_IS_DOUBLE) - dswap_(&i_n, x, &i_incx, y, &i_incy); -#else - sswap_(&i_n, x, &i_incx, y, &i_incy); -#endif - return; - } -#endif - { - long i; - for(i = 0; i < n; i++) - { - real z = x[i*incx]; - x[i*incx] = y[i*incy]; - y[i*incy] = z; - } - } -} - -void THBlas_(scal)(long n, real a, real *x, long incx) -{ - if(n == 1) - incx = 1; - -#if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)) - if( (n <= INT_MAX) && (incx <= INT_MAX) ) - { - int i_n = (int)n; - int i_incx = (int)incx; - -#if 
defined(TH_REAL_IS_DOUBLE) - dscal_(&i_n, &a, x, &i_incx); -#else - sscal_(&i_n, &a, x, &i_incx); -#endif - return; - } -#endif - { - long i; - for(i = 0; i < n; i++) { - if (a == 0) { - x[i*incx] = 0; - } else { - x[i*incx] *= a; - } - } - } -} - -void THBlas_(copy)(long n, real *x, long incx, real *y, long incy) -{ - if(n == 1) - { - incx = 1; - incy = 1; - } - -#if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)) - if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) - { - int i_n = (int)n; - int i_incx = (int)incx; - int i_incy = (int)incy; - -#if defined(TH_REAL_IS_DOUBLE) - dcopy_(&i_n, x, &i_incx, y, &i_incy); -#else - scopy_(&i_n, x, &i_incx, y, &i_incy); -#endif - return; - } -#endif - { - long i; - for(i = 0; i < n; i++) - y[i*incy] = x[i*incx]; - } -} - -void THBlas_(axpy)(long n, real a, real *x, long incx, real *y, long incy) -{ - if(n == 1) - { - incx = 1; - incy = 1; - } - -#if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)) - if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) - { - int i_n = (int)n; - int i_incx = (int)incx; - int i_incy = (int)incy; - -#if defined(TH_REAL_IS_DOUBLE) - daxpy_(&i_n, &a, x, &i_incx, y, &i_incy); -#else - saxpy_(&i_n, &a, x, &i_incx, y, &i_incy); -#endif - return; - } -#endif - { - long i; - for(i = 0; i < n; i++) - y[i*incy] += a*x[i*incx]; - } -} - -real THBlas_(dot)(long n, real *x, long incx, real *y, long incy) -{ - if(n == 1) - { - incx = 1; - incy = 1; - } - -#if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)) - if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) - { - int i_n = (int)n; - int i_incx = (int)incx; - int i_incy = (int)incy; - -#if defined(TH_REAL_IS_DOUBLE) - return (real) ddot_(&i_n, x, &i_incx, y, &i_incy); -#else - return (real) sdot_(&i_n, x, &i_incx, y, &i_incy); -#endif - } -#endif - { - long i; - real sum = 0; - for(i = 0; i < n; i++) - sum += x[i*incx]*y[i*incy]; - 
return sum; - } -} - -void THBlas_(gemv)(char trans, long m, long n, real alpha, real *a, long lda, real *x, long incx, real beta, real *y, long incy) -{ - if(n == 1) - lda = m; - -#if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)) - if( (m <= INT_MAX) && (n <= INT_MAX) && - (lda > 0) && (lda <= INT_MAX) && - (incx > 0) && (incx <= INT_MAX) && - (incy > 0) && (incy <= INT_MAX) ) - { - int i_m = (int)m; - int i_n = (int)n; - int i_lda = (int)lda; - int i_incx = (int)incx; - int i_incy = (int)incy; - -#if defined(TH_REAL_IS_DOUBLE) - dgemv_(&trans, &i_m, &i_n, &alpha, a, &i_lda, x, &i_incx, &beta, y, &i_incy); -#else - sgemv_(&trans, &i_m, &i_n, &alpha, a, &i_lda, x, &i_incx, &beta, y, &i_incy); -#endif - return; - } -#endif - { - long i, j; - - if( (trans == 'T') || (trans == 't') ) - { - for(i = 0; i < n; i++) - { - real sum = 0; - real *row_ = a+lda*i; - for(j = 0; j < m; j++) - sum += x[j*incx]*row_[j]; - if (beta == 0) - y[i*incy] = alpha*sum; - else - y[i*incy] = beta*y[i*incy] + alpha*sum; - } - } - else - { - if(beta != 1) - THBlas_(scal)(m, beta, y, incy); - - for(j = 0; j < n; j++) - { - real *column_ = a+lda*j; - real z = alpha*x[j*incx]; - for(i = 0; i < m; i++) - y[i*incy] += z*column_[i]; - } - } - } -} - -void THBlas_(ger)(long m, long n, real alpha, real *x, long incx, real *y, long incy, real *a, long lda) -{ - if(n == 1) - lda = m; - -#if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)) - if( (m <= INT_MAX) && (n <= INT_MAX) && (lda <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) - { - int i_m = (int)m; - int i_n = (int)n; - int i_lda = (int)lda; - int i_incx = (int)incx; - int i_incy = (int)incy; - -#if defined(TH_REAL_IS_DOUBLE) - dger_(&i_m, &i_n, &alpha, x, &i_incx, y, &i_incy, a, &i_lda); -#else - sger_(&i_m, &i_n, &alpha, x, &i_incx, y, &i_incy, a, &i_lda); -#endif - return; - } -#endif - { - long i, j; - for(j = 0; j < n; j++) - { - real *column_ = a+j*lda; - real z = 
alpha*y[j*incy]; - for(i = 0; i < m; i++) - column_[i] += z*x[i*incx] ; - } - } -} - -void THBlas_(gemm)(char transa, char transb, long m, long n, long k, real alpha, real *a, long lda, real *b, long ldb, real beta, real *c, long ldc) -{ - int transa_ = ((transa == 't') || (transa == 'T')); - int transb_ = ((transb == 't') || (transb == 'T')); - - if(n == 1) - ldc = m; - - if(transa_) - { - if(m == 1) - lda = k; - } - else - { - if(k == 1) - lda = m; - } - - if(transb_) - { - if(k == 1) - ldb = n; - } - else - { - if(n == 1) - ldb = k; - } - -#if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)) - if( (m <= INT_MAX) && (n <= INT_MAX) && (k <= INT_MAX) && (lda <= INT_MAX) && (ldb <= INT_MAX) && (ldc <= INT_MAX) ) - { - int i_m = (int)m; - int i_n = (int)n; - int i_k = (int)k; - int i_lda = (int)lda; - int i_ldb = (int)ldb; - int i_ldc = (int)ldc; - -#if defined(TH_REAL_IS_DOUBLE) - dgemm_(&transa, &transb, &i_m, &i_n, &i_k, &alpha, a, &i_lda, b, &i_ldb, &beta, c, &i_ldc); -#else - sgemm_(&transa, &transb, &i_m, &i_n, &i_k, &alpha, a, &i_lda, b, &i_ldb, &beta, c, &i_ldc); -#endif - return; - } -#endif - { - long i, j, l; - if(!transa_ && !transb_) - { - real *a_ = a; - for(i = 0; i < m; i++) - { - real *b_ = b; - for(j = 0; j < n; j++) - { - real sum = 0; - for(l = 0; l < k; l++) - sum += a_[l*lda]*b_[l]; - b_ += ldb; - if (beta == 0) - c[j*ldc+i] = alpha*sum; - else - c[j*ldc+i] = beta*c[j*ldc+i]+alpha*sum; - } - a_++; - } - } - else if(transa_ && !transb_) - { - real *a_ = a; - for(i = 0; i < m; i++) - { - real *b_ = b; - for(j = 0; j < n; j++) - { - real sum = 0; - for(l = 0; l < k; l++) - sum += a_[l]*b_[l]; - b_ += ldb; - if (beta == 0) - c[j*ldc+i] = alpha*sum; - else - c[j*ldc+i] = beta*c[j*ldc+i]+alpha*sum; - } - a_ += lda; - } - } - else if(!transa_ && transb_) - { - real *a_ = a; - for(i = 0; i < m; i++) - { - real *b_ = b; - for(j = 0; j < n; j++) - { - real sum = 0; - for(l = 0; l < k; l++) - sum += a_[l*lda]*b_[l*ldb]; - 
b_++; - if (beta == 0) - c[j*ldc+i] = alpha*sum; - else - c[j*ldc+i] = beta*c[j*ldc+i]+alpha*sum; - } - a_++; - } - } - else - { - real *a_ = a; - for(i = 0; i < m; i++) - { - real *b_ = b; - for(j = 0; j < n; j++) - { - real sum = 0; - for(l = 0; l < k; l++) - sum += a_[l]*b_[l*ldb]; - b_++; - if (beta == 0) - c[j*ldc+i] = alpha*sum; - else - c[j*ldc+i] = beta*c[j*ldc+i]+alpha*sum; - } - a_ += lda; - } - } - } -} - -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/generic/THBlas.h b/contrib/lua-torch/torch7/lib/TH/generic/THBlas.h deleted file mode 100644 index 9e14f5a84..000000000 --- a/contrib/lua-torch/torch7/lib/TH/generic/THBlas.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "generic/THBlas.h" -#else - -/* Level 1 */ -TH_API void THBlas_(swap)(long n, real *x, long incx, real *y, long incy); -TH_API void THBlas_(scal)(long n, real a, real *x, long incx); -TH_API void THBlas_(copy)(long n, real *x, long incx, real *y, long incy); -TH_API void THBlas_(axpy)(long n, real a, real *x, long incx, real *y, long incy); -TH_API real THBlas_(dot)(long n, real *x, long incx, real *y, long incy); - -/* Level 2 */ -TH_API void THBlas_(gemv)(char trans, long m, long n, real alpha, real *a, long lda, real *x, long incx, real beta, real *y, long incy); -TH_API void THBlas_(ger)(long m, long n, real alpha, real *x, long incx, real *y, long incy, real *a, long lda); - -/* Level 3 */ -TH_API void THBlas_(gemm)(char transa, char transb, long m, long n, long k, real alpha, real *a, long lda, real *b, long ldb, real beta, real *c, long ldc); - -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/generic/THLapack.c b/contrib/lua-torch/torch7/lib/TH/generic/THLapack.c deleted file mode 100644 index 148ae26c4..000000000 --- a/contrib/lua-torch/torch7/lib/TH/generic/THLapack.c +++ /dev/null @@ -1,270 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "generic/THLapack.c" -#else - - -TH_EXTERNC void dgesv_(int *n, int *nrhs, double *a, int 
*lda, int *ipiv, double *b, int *ldb, int *info); -TH_EXTERNC void sgesv_(int *n, int *nrhs, float *a, int *lda, int *ipiv, float *b, int *ldb, int *info); -TH_EXTERNC void dtrtrs_(char *uplo, char *trans, char *diag, int *n, int *nrhs, double *a, int *lda, double *b, int *ldb, int *info); -TH_EXTERNC void strtrs_(char *uplo, char *trans, char *diag, int *n, int *nrhs, float *a, int *lda, float *b, int *ldb, int *info); -TH_EXTERNC void dgels_(char *trans, int *m, int *n, int *nrhs, double *a, int *lda, double *b, int *ldb, double *work, int *lwork, int *info); -TH_EXTERNC void sgels_(char *trans, int *m, int *n, int *nrhs, float *a, int *lda, float *b, int *ldb, float *work, int *lwork, int *info); -TH_EXTERNC void dsyev_(char *jobz, char *uplo, int *n, double *a, int *lda, double *w, double *work, int *lwork, int *info); -TH_EXTERNC void ssyev_(char *jobz, char *uplo, int *n, float *a, int *lda, float *w, float *work, int *lwork, int *info); -TH_EXTERNC void dgeev_(char *jobvl, char *jobvr, int *n, double *a, int *lda, double *wr, double *wi, double* vl, int *ldvl, double *vr, int *ldvr, double *work, int *lwork, int *info); -TH_EXTERNC void sgeev_(char *jobvl, char *jobvr, int *n, float *a, int *lda, float *wr, float *wi, float* vl, int *ldvl, float *vr, int *ldvr, float *work, int *lwork, int *info); -TH_EXTERNC void dgesvd_(char *jobu, char *jobvt, int *m, int *n, double *a, int *lda, double *s, double *u, int *ldu, double *vt, int *ldvt, double *work, int *lwork, int *info); -TH_EXTERNC void sgesvd_(char *jobu, char *jobvt, int *m, int *n, float *a, int *lda, float *s, float *u, int *ldu, float *vt, int *ldvt, float *work, int *lwork, int *info); -TH_EXTERNC void dgetrf_(int *m, int *n, double *a, int *lda, int *ipiv, int *info); -TH_EXTERNC void sgetrf_(int *m, int *n, float *a, int *lda, int *ipiv, int *info); -TH_EXTERNC void dgetrs_(char *trans, int *n, int *nrhs, double *a, int *lda, int *ipiv, double *b, int *ldb, int *info); -TH_EXTERNC void 
sgetrs_(char *trans, int *n, int *nrhs, float *a, int *lda, int *ipiv, float *b, int *ldb, int *info); -TH_EXTERNC void dgetri_(int *n, double *a, int *lda, int *ipiv, double *work, int *lwork, int *info); -TH_EXTERNC void sgetri_(int *n, float *a, int *lda, int *ipiv, float *work, int *lwork, int *info); -TH_EXTERNC void dpotrf_(char *uplo, int *n, double *a, int *lda, int *info); -TH_EXTERNC void spotrf_(char *uplo, int *n, float *a, int *lda, int *info); -TH_EXTERNC void dpotri_(char *uplo, int *n, double *a, int *lda, int *info); -TH_EXTERNC void spotri_(char *uplo, int *n, float *a, int *lda, int *info); -TH_EXTERNC void dpotrs_(char *uplo, int *n, int *nrhs, double *a, int *lda, double *b, int *ldb, int *info); -TH_EXTERNC void spotrs_(char *uplo, int *n, int *nrhs, float *a, int *lda, float *b, int *ldb, int *info); -TH_EXTERNC void sgeqrf_(int *m, int *n, float *a, int *lda, float *tau, float *work, int *lwork, int *info); -TH_EXTERNC void dgeqrf_(int *m, int *n, double *a, int *lda, double *tau, double *work, int *lwork, int *info); -TH_EXTERNC void sorgqr_(int *m, int *n, int *k, float *a, int *lda, float *tau, float *work, int *lwork, int *info); -TH_EXTERNC void dorgqr_(int *m, int *n, int *k, double *a, int *lda, double *tau, double *work, int *lwork, int *info); -TH_EXTERNC void sormqr_(char *side, char *trans, int *m, int *n, int *k, float *a, int *lda, float *tau, float *c, int *ldc, float *work, int *lwork, int *info); -TH_EXTERNC void dormqr_(char *side, char *trans, int *m, int *n, int *k, double *a, int *lda, double *tau, double *c, int *ldc, double *work, int *lwork, int *info); -TH_EXTERNC void spstrf_(char *uplo, int *n, float *a, int *lda, int *piv, int *rank, float *tol, float *work, int *info); -TH_EXTERNC void dpstrf_(char *uplo, int *n, double *a, int *lda, int *piv, int *rank, double *tol, double *work, int *info); - - -/* Compute the solution to a real system of linear equations A * X = B */ -void THLapack_(gesv)(int n, int nrhs, real 
*a, int lda, int *ipiv, real *b, int ldb, int* info) -{ -#ifdef USE_LAPACK -#if defined(TH_REAL_IS_DOUBLE) - dgesv_(&n, &nrhs, a, &lda, ipiv, b, &ldb, info); -#else - sgesv_(&n, &nrhs, a, &lda, ipiv, b, &ldb, info); -#endif -#else - THError("gesv : Lapack library not found in compile time\n"); -#endif - return; -} - -/* Solve a triangular system of the form A * X = B or A^T * X = B */ -void THLapack_(trtrs)(char uplo, char trans, char diag, int n, int nrhs, real *a, int lda, real *b, int ldb, int* info) -{ -#ifdef USE_LAPACK -#if defined(TH_REAL_IS_DOUBLE) - dtrtrs_(&uplo, &trans, &diag, &n, &nrhs, a, &lda, b, &ldb, info); -#else - strtrs_(&uplo, &trans, &diag, &n, &nrhs, a, &lda, b, &ldb, info); -#endif -#else - THError("trtrs : Lapack library not found in compile time\n"); -#endif - return; -} - -/* Solve overdetermined or underdetermined real linear systems involving an -M-by-N matrix A, or its transpose, using a QR or LQ factorization of A */ -void THLapack_(gels)(char trans, int m, int n, int nrhs, real *a, int lda, real *b, int ldb, real *work, int lwork, int *info) -{ -#ifdef USE_LAPACK -#if defined(TH_REAL_IS_DOUBLE) - dgels_(&trans, &m, &n, &nrhs, a, &lda, b, &ldb, work, &lwork, info); -#else - sgels_(&trans, &m, &n, &nrhs, a, &lda, b, &ldb, work, &lwork, info); -#endif -#else - THError("gels : Lapack library not found in compile time\n"); -#endif -} - -/* Compute all eigenvalues and, optionally, eigenvectors of a real symmetric -matrix A */ -void THLapack_(syev)(char jobz, char uplo, int n, real *a, int lda, real *w, real *work, int lwork, int *info) -{ -#ifdef USE_LAPACK -#if defined(TH_REAL_IS_DOUBLE) - dsyev_(&jobz, &uplo, &n, a, &lda, w, work, &lwork, info); -#else - ssyev_(&jobz, &uplo, &n, a, &lda, w, work, &lwork, info); -#endif -#else - THError("syev : Lapack library not found in compile time\n"); -#endif -} - -/* Compute for an N-by-N real nonsymmetric matrix A, the eigenvalues and, -optionally, the left and/or right eigenvectors */ -void 
THLapack_(geev)(char jobvl, char jobvr, int n, real *a, int lda, real *wr, real *wi, real* vl, int ldvl, real *vr, int ldvr, real *work, int lwork, int *info) -{ -#ifdef USE_LAPACK -#if defined(TH_REAL_IS_DOUBLE) - dgeev_(&jobvl, &jobvr, &n, a, &lda, wr, wi, vl, &ldvl, vr, &ldvr, work, &lwork, info); -#else - sgeev_(&jobvl, &jobvr, &n, a, &lda, wr, wi, vl, &ldvl, vr, &ldvr, work, &lwork, info); -#endif -#else - THError("geev : Lapack library not found in compile time\n"); -#endif -} - -/* Compute the singular value decomposition (SVD) of a real M-by-N matrix A, -optionally computing the left and/or right singular vectors */ -void THLapack_(gesvd)(char jobu, char jobvt, int m, int n, real *a, int lda, real *s, real *u, int ldu, real *vt, int ldvt, real *work, int lwork, int *info) -{ -#ifdef USE_LAPACK -#if defined(TH_REAL_IS_DOUBLE) - dgesvd_( &jobu, &jobvt, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, &lwork, info); -#else - sgesvd_( &jobu, &jobvt, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, &lwork, info); -#endif -#else - THError("gesvd : Lapack library not found in compile time\n"); -#endif -} - -/* LU decomposition */ -void THLapack_(getrf)(int m, int n, real *a, int lda, int *ipiv, int *info) -{ -#ifdef USE_LAPACK -#if defined(TH_REAL_IS_DOUBLE) - dgetrf_(&m, &n, a, &lda, ipiv, info); -#else - sgetrf_(&m, &n, a, &lda, ipiv, info); -#endif -#else - THError("getrf : Lapack library not found in compile time\n"); -#endif -} - -void THLapack_(getrs)(char trans, int n, int nrhs, real *a, int lda, int *ipiv, real *b, int ldb, int *info) -{ -#ifdef USE_LAPACK -#if defined(TH_REAL_IS_DOUBLE) - dgetrs_(&trans, &n, &nrhs, a, &lda, ipiv, b, &ldb, info); -#else - sgetrs_(&trans, &n, &nrhs, a, &lda, ipiv, b, &ldb, info); -#endif -#else - THError("getrs : Lapack library not found in compile time\n"); -#endif -} - -/* Matrix Inverse */ -void THLapack_(getri)(int n, real *a, int lda, int *ipiv, real *work, int lwork, int* info) -{ -#ifdef USE_LAPACK -#if 
defined(TH_REAL_IS_DOUBLE) - dgetri_(&n, a, &lda, ipiv, work, &lwork, info); -#else - sgetri_(&n, a, &lda, ipiv, work, &lwork, info); -#endif -#else - THError("getri : Lapack library not found in compile time\n"); -#endif -} - -/* Cholesky factorization */ -void THLapack_(potrf)(char uplo, int n, real *a, int lda, int *info) -{ -#ifdef USE_LAPACK -#if defined(TH_REAL_IS_DOUBLE) - dpotrf_(&uplo, &n, a, &lda, info); -#else - spotrf_(&uplo, &n, a, &lda, info); -#endif -#else - THError("potrf : Lapack library not found in compile time\n"); -#endif -} - -/* Solve A*X = B with a symmetric positive definite matrix A using the Cholesky factorization */ -void THLapack_(potrs)(char uplo, int n, int nrhs, real *a, int lda, real *b, int ldb, int *info) -{ -#ifdef USE_LAPACK -#if defined(TH_REAL_IS_DOUBLE) - dpotrs_(&uplo, &n, &nrhs, a, &lda, b, &ldb, info); -#else - spotrs_(&uplo, &n, &nrhs, a, &lda, b, &ldb, info); -#endif -#else - THError("potrs: Lapack library not found in compile time\n"); -#endif -} - -/* Cholesky factorization based Matrix Inverse */ -void THLapack_(potri)(char uplo, int n, real *a, int lda, int *info) -{ -#ifdef USE_LAPACK -#if defined(TH_REAL_IS_DOUBLE) - dpotri_(&uplo, &n, a, &lda, info); -#else - spotri_(&uplo, &n, a, &lda, info); -#endif -#else - THError("potri: Lapack library not found in compile time\n"); -#endif -} - -/* Cholesky factorization with complete pivoting */ -void THLapack_(pstrf)(char uplo, int n, real *a, int lda, int *piv, int *rank, real tol, real *work, int *info) -{ -#ifdef USE_LAPACK -#if defined(TH_REAL_IS_DOUBLE) - dpstrf_(&uplo, &n, a, &lda, piv, rank, &tol, work, info); -#else - spstrf_(&uplo, &n, a, &lda, piv, rank, &tol, work, info); -#endif -#else - THError("pstrf: Lapack library not found at compile time\n"); -#endif -} - -/* QR decomposition */ -void THLapack_(geqrf)(int m, int n, real *a, int lda, real *tau, real *work, int lwork, int *info) -{ -#ifdef USE_LAPACK -#if defined(TH_REAL_IS_DOUBLE) - dgeqrf_(&m, &n, a, 
&lda, tau, work, &lwork, info); -#else - sgeqrf_(&m, &n, a, &lda, tau, work, &lwork, info); -#endif -#else - THError("geqrf: Lapack library not found in compile time\n"); -#endif -} - -/* Build Q from output of geqrf */ -void THLapack_(orgqr)(int m, int n, int k, real *a, int lda, real *tau, real *work, int lwork, int *info) -{ -#ifdef USE_LAPACK -#if defined(TH_REAL_IS_DOUBLE) - dorgqr_(&m, &n, &k, a, &lda, tau, work, &lwork, info); -#else - sorgqr_(&m, &n, &k, a, &lda, tau, work, &lwork, info); -#endif -#else - THError("orgqr: Lapack library not found in compile time\n"); -#endif -} - -/* Multiply Q with a matrix using the output of geqrf */ -void THLapack_(ormqr)(char side, char trans, int m, int n, int k, real *a, int lda, real *tau, real *c, int ldc, real *work, int lwork, int *info) -{ -#ifdef USE_LAPACK -#if defined(TH_REAL_IS_DOUBLE) - dormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, &lwork, info); -#else - sormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, &lwork, info); -#endif -#else - THError("ormqr: Lapack library not found in compile time\n"); -#endif -} - - -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/generic/THLapack.h b/contrib/lua-torch/torch7/lib/TH/generic/THLapack.h deleted file mode 100644 index b464dd2d2..000000000 --- a/contrib/lua-torch/torch7/lib/TH/generic/THLapack.h +++ /dev/null @@ -1,40 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "generic/THLapack.h" -#else - -/* AX=B */ -TH_API void THLapack_(gesv)(int n, int nrhs, real *a, int lda, int *ipiv, real *b, int ldb, int* info); -/* Solve a triangular system of the form A * X = B or A^T * X = B */ -TH_API void THLapack_(trtrs)(char uplo, char trans, char diag, int n, int nrhs, real *a, int lda, real *b, int ldb, int* info); -/* ||AX-B|| */ -TH_API void THLapack_(gels)(char trans, int m, int n, int nrhs, real *a, int lda, real *b, int ldb, real *work, int lwork, int *info); -/* Eigenvals */ -TH_API void THLapack_(syev)(char jobz, char uplo, 
int n, real *a, int lda, real *w, real *work, int lwork, int *info); -/* Non-sym eigenvals */ -TH_API void THLapack_(geev)(char jobvl, char jobvr, int n, real *a, int lda, real *wr, real *wi, real* vl, int ldvl, real *vr, int ldvr, real *work, int lwork, int *info); -/* svd */ -TH_API void THLapack_(gesvd)(char jobu, char jobvt, int m, int n, real *a, int lda, real *s, real *u, int ldu, real *vt, int ldvt, real *work, int lwork, int *info); -/* LU decomposition */ -TH_API void THLapack_(getrf)(int m, int n, real *a, int lda, int *ipiv, int *info); -TH_API void THLapack_(getrs)(char trans, int n, int nrhs, real *a, int lda, int *ipiv, real *b, int ldb, int *info); -/* Matrix Inverse */ -TH_API void THLapack_(getri)(int n, real *a, int lda, int *ipiv, real *work, int lwork, int* info); - -/* Positive Definite matrices */ -/* Cholesky factorization */ -void THLapack_(potrf)(char uplo, int n, real *a, int lda, int *info); -/* Matrix inverse based on Cholesky factorization */ -void THLapack_(potri)(char uplo, int n, real *a, int lda, int *info); -/* Solve A*X = B with a symmetric positive definite matrix A using the Cholesky factorization */ -void THLapack_(potrs)(char uplo, int n, int nrhs, real *a, int lda, real *b, int ldb, int *info); -/* Cholesky factorization with complete pivoting. 
*/ -void THLapack_(pstrf)(char uplo, int n, real *a, int lda, int *piv, int *rank, real tol, real *work, int *info); - -/* QR decomposition */ -void THLapack_(geqrf)(int m, int n, real *a, int lda, real *tau, real *work, int lwork, int *info); -/* Build Q from output of geqrf */ -void THLapack_(orgqr)(int m, int n, int k, real *a, int lda, real *tau, real *work, int lwork, int *info); -/* Multiply Q with a matrix from output of geqrf */ -void THLapack_(ormqr)(char side, char trans, int m, int n, int k, real *a, int lda, real *tau, real *c, int ldc, real *work, int lwork, int *info); - -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/generic/THStorage.c b/contrib/lua-torch/torch7/lib/TH/generic/THStorage.c deleted file mode 100644 index a592cfb62..000000000 --- a/contrib/lua-torch/torch7/lib/TH/generic/THStorage.c +++ /dev/null @@ -1,226 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "generic/THStorage.c" -#else - -real* THStorage_(data)(const THStorage *self) -{ - return self->data; -} - -ptrdiff_t THStorage_(size)(const THStorage *self) -{ - return self->size; -} - -size_t THStorage_(elementSize)() -{ - return sizeof(real); -} - -THStorage* THStorage_(new)(void) -{ - return THStorage_(newWithSize)(0); -} - -THStorage* THStorage_(newWithSize)(ptrdiff_t size) -{ - return THStorage_(newWithAllocator)(size, &THDefaultAllocator, NULL); -} - -THStorage* THStorage_(newWithAllocator)(ptrdiff_t size, - THAllocator *allocator, - void *allocatorContext) -{ - THStorage *storage = THAlloc(sizeof(THStorage)); - storage->data = allocator->malloc(allocatorContext, sizeof(real)*size); - storage->size = size; - storage->refcount = 1; - storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE | TH_STORAGE_FREEMEM; - storage->allocator = allocator; - storage->allocatorContext = allocatorContext; - return storage; -} - -THStorage* THStorage_(newWithMapping)(const char *filename, ptrdiff_t size, int flags) -{ - THMapAllocatorContext *ctx = 
THMapAllocatorContext_new(filename, flags); - - THStorage *storage = THStorage_(newWithAllocator)(size, - &THMapAllocator, - ctx); - - if(size <= 0) - storage->size = THMapAllocatorContext_size(ctx)/sizeof(real); - - THStorage_(clearFlag)(storage, TH_STORAGE_RESIZABLE); - - return storage; -} - -THStorage* THStorage_(newWithSize1)(real data0) -{ - THStorage *self = THStorage_(newWithSize)(1); - self->data[0] = data0; - return self; -} - -THStorage* THStorage_(newWithSize2)(real data0, real data1) -{ - THStorage *self = THStorage_(newWithSize)(2); - self->data[0] = data0; - self->data[1] = data1; - return self; -} - -THStorage* THStorage_(newWithSize3)(real data0, real data1, real data2) -{ - THStorage *self = THStorage_(newWithSize)(3); - self->data[0] = data0; - self->data[1] = data1; - self->data[2] = data2; - return self; -} - -THStorage* THStorage_(newWithSize4)(real data0, real data1, real data2, real data3) -{ - THStorage *self = THStorage_(newWithSize)(4); - self->data[0] = data0; - self->data[1] = data1; - self->data[2] = data2; - self->data[3] = data3; - return self; -} - -void THStorage_(setFlag)(THStorage *storage, const char flag) -{ - storage->flag |= flag; -} - -void THStorage_(clearFlag)(THStorage *storage, const char flag) -{ - storage->flag &= ~flag; -} - -void THStorage_(retain)(THStorage *storage) -{ - if(storage && (storage->flag & TH_STORAGE_REFCOUNTED)) - THAtomicIncrementRef(&storage->refcount); -} - -void THStorage_(free)(THStorage *storage) -{ - if(!storage) - return; - - if((storage->flag & TH_STORAGE_REFCOUNTED) && (THAtomicGet(&storage->refcount) > 0)) - { - if(THAtomicDecrementRef(&storage->refcount)) - { - if(storage->flag & TH_STORAGE_FREEMEM) { - storage->allocator->free(storage->allocatorContext, storage->data); - } - if(storage->flag & TH_STORAGE_VIEW) { - THStorage_(free)(storage->view); - } - THFree(storage); - } - } -} - -THStorage* THStorage_(newWithData)(real *data, ptrdiff_t size) -{ - return 
THStorage_(newWithDataAndAllocator)(data, size, - &THDefaultAllocator, NULL); -} - -THStorage* THStorage_(newWithDataAndAllocator)(real* data, ptrdiff_t size, - THAllocator* allocator, - void* allocatorContext) { - THStorage *storage = THAlloc(sizeof(THStorage)); - storage->data = data; - storage->size = size; - storage->refcount = 1; - storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE | TH_STORAGE_FREEMEM; - storage->allocator = allocator; - storage->allocatorContext = allocatorContext; - return storage; -} - -void THStorage_(resize)(THStorage *storage, ptrdiff_t size) -{ - if(storage->flag & TH_STORAGE_RESIZABLE) - { - if(storage->allocator->realloc == NULL) { - /* case when the allocator does not have a realloc defined */ - real *old_data = storage->data; - ptrdiff_t old_size = storage->size; - if (size == 0) { - storage->data = NULL; - } else { - storage->data = storage->allocator->malloc( - storage->allocatorContext, - sizeof(real)*size); - } - storage->size = size; - if (old_data != NULL) { - ptrdiff_t copy_size = old_size; - if (storage->size < copy_size) { - copy_size = storage->size; - } - if (copy_size > 0) { - memcpy(storage->data, old_data, sizeof(real)*copy_size); - } - storage->allocator->free(storage->allocatorContext, old_data); - } - } else { - storage->data = storage->allocator->realloc( - storage->allocatorContext, - storage->data, - sizeof(real)*size); - storage->size = size; - } - } else { - THError("Trying to resize storage that is not resizable"); - } -} - -void THStorage_(fill)(THStorage *storage, real value) -{ - ptrdiff_t i; - for(i = 0; i < storage->size; i++) - storage->data[i] = value; -} - -void THStorage_(set)(THStorage *self, ptrdiff_t idx, real value) -{ - THArgCheck((idx >= 0) && (idx < self->size), 2, "out of bounds"); - self->data[idx] = value; -} - -real THStorage_(get)(const THStorage *self, ptrdiff_t idx) -{ - THArgCheck((idx >= 0) && (idx < self->size), 2, "out of bounds"); - return self->data[idx]; -} - -void 
THStorage_(swap)(THStorage *storage1, THStorage *storage2) -{ -#define SWAP(val) { val = storage1->val; storage1->val = storage2->val; storage2->val = val; } - real *data; - ptrdiff_t size; - char flag; - THAllocator *allocator; - void *allocatorContext; - struct THStorage *view; - - SWAP(data); - SWAP(size); - SWAP(flag); - // don't swap refcount! - SWAP(allocator); - SWAP(allocatorContext); - SWAP(view); -#undef SWAP -} - -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/generic/THStorage.h b/contrib/lua-torch/torch7/lib/TH/generic/THStorage.h deleted file mode 100644 index 3dd214b33..000000000 --- a/contrib/lua-torch/torch7/lib/TH/generic/THStorage.h +++ /dev/null @@ -1,71 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "generic/THStorage.h" -#else - -/* on pourrait avoir un liste chainee - qui initialise math, lab structures (or more). - mouais -- complique. - - Pb: THMapStorage is kind of a class - THLab_()... comment je m'en sors? - - en template, faudrait que je les instancie toutes!!! oh boy! - Et comment je sais que c'est pour Cuda? Le type float est le meme dans les <> - - au bout du compte, ca serait sur des pointeurs float/double... etc... = facile. - primitives?? 
- */ - -#define TH_STORAGE_REFCOUNTED 1 -#define TH_STORAGE_RESIZABLE 2 -#define TH_STORAGE_FREEMEM 4 -#define TH_STORAGE_VIEW 8 - -typedef struct THStorage -{ - real *data; - ptrdiff_t size; - int refcount; - char flag; - THAllocator *allocator; - void *allocatorContext; - struct THStorage *view; -} THStorage; - -TH_API real* THStorage_(data)(const THStorage*); -TH_API ptrdiff_t THStorage_(size)(const THStorage*); -TH_API size_t THStorage_(elementSize)(void); - -/* slow access -- checks everything */ -TH_API void THStorage_(set)(THStorage*, ptrdiff_t, real); -TH_API real THStorage_(get)(const THStorage*, ptrdiff_t); - -TH_API THStorage* THStorage_(new)(void); -TH_API THStorage* THStorage_(newWithSize)(ptrdiff_t size); -TH_API THStorage* THStorage_(newWithSize1)(real); -TH_API THStorage* THStorage_(newWithSize2)(real, real); -TH_API THStorage* THStorage_(newWithSize3)(real, real, real); -TH_API THStorage* THStorage_(newWithSize4)(real, real, real, real); -TH_API THStorage* THStorage_(newWithMapping)(const char *filename, ptrdiff_t size, int flags); - -/* takes ownership of data */ -TH_API THStorage* THStorage_(newWithData)(real *data, ptrdiff_t size); - -TH_API THStorage* THStorage_(newWithAllocator)(ptrdiff_t size, - THAllocator* allocator, - void *allocatorContext); -TH_API THStorage* THStorage_(newWithDataAndAllocator)( - real* data, ptrdiff_t size, THAllocator* allocator, void *allocatorContext); - -/* should not differ with API */ -TH_API void THStorage_(setFlag)(THStorage *storage, const char flag); -TH_API void THStorage_(clearFlag)(THStorage *storage, const char flag); -TH_API void THStorage_(retain)(THStorage *storage); -TH_API void THStorage_(swap)(THStorage *storage1, THStorage *storage2); - -/* might differ with other API (like CUDA) */ -TH_API void THStorage_(free)(THStorage *storage); -TH_API void THStorage_(resize)(THStorage *storage, ptrdiff_t size); -TH_API void THStorage_(fill)(THStorage *storage, real value); - -#endif diff --git 
a/contrib/lua-torch/torch7/lib/TH/generic/THStorageCopy.c b/contrib/lua-torch/torch7/lib/TH/generic/THStorageCopy.c deleted file mode 100644 index ce4b57eaf..000000000 --- a/contrib/lua-torch/torch7/lib/TH/generic/THStorageCopy.c +++ /dev/null @@ -1,75 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "generic/THStorageCopy.c" -#else - -void THStorage_(rawCopy)(THStorage *storage, real *src) -{ - ptrdiff_t i; - for(i = 0; i < storage->size; i++) - storage->data[i] = src[i]; -} - -void THStorage_(copy)(THStorage *storage, THStorage *src) -{ - THArgCheck(storage->size == src->size, 2, "size mismatch"); - THStorage_(rawCopy)(storage, src->data); -} - -#define IMPLEMENT_THStorage_COPY(TYPENAMESRC) \ -void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \ -{ \ - ptrdiff_t i; \ - for(i = 0; i < storage->size; i++) \ - storage->data[i] = (real)src->data[i]; \ -} - -#define IMPLEMENT_THStorage_COPY_FROM_HALF(TYPENAMESRC) \ -void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \ -{ \ - THArgCheck(storage->size == src->size, 2, "size mismatch"); \ - ptrdiff_t i; \ - for(i = 0; i < storage->size; i++) \ - storage->data[i] = (real)TH_half2float(src->data[i]); \ -} - -#define IMPLEMENT_THStorage_COPY_TO_HALF(TYPENAMESRC) \ -void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \ -{ \ - THArgCheck(storage->size == src->size, 2, "size mismatch"); \ - ptrdiff_t i; \ - for(i = 0; i < storage->size; i++) \ - storage->data[i] = TH_float2half((float)(src->data[i])); \ -} - -#define IMPLEMENT_THStorage_COPY_TO_FROM_HALF(TYPENAMESRC) \ -void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \ -{ \ - THArgCheck(storage->size == src->size, 2, "size mismatch"); \ - ptrdiff_t i; \ - for(i = 0; i < storage->size; i++) \ - storage->data[i] = src->data[i]; \ -} - -#ifndef TH_REAL_IS_HALF -IMPLEMENT_THStorage_COPY(Byte) -IMPLEMENT_THStorage_COPY(Char) 
-IMPLEMENT_THStorage_COPY(Short) -IMPLEMENT_THStorage_COPY(Int) -IMPLEMENT_THStorage_COPY(Long) -IMPLEMENT_THStorage_COPY(Float) -IMPLEMENT_THStorage_COPY(Double) -IMPLEMENT_THStorage_COPY_FROM_HALF(Half) -#else -/* only allow pass-through for Half */ -IMPLEMENT_THStorage_COPY_TO_FROM_HALF(Half) -IMPLEMENT_THStorage_COPY_TO_HALF(Byte) -IMPLEMENT_THStorage_COPY_TO_HALF(Char) -IMPLEMENT_THStorage_COPY_TO_HALF(Short) -IMPLEMENT_THStorage_COPY_TO_HALF(Int) -IMPLEMENT_THStorage_COPY_TO_HALF(Long) -IMPLEMENT_THStorage_COPY_TO_HALF(Float) -IMPLEMENT_THStorage_COPY_TO_HALF(Double) -#endif - - -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/generic/THStorageCopy.h b/contrib/lua-torch/torch7/lib/TH/generic/THStorageCopy.h deleted file mode 100644 index ce8a2a690..000000000 --- a/contrib/lua-torch/torch7/lib/TH/generic/THStorageCopy.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "generic/THStorageCopy.h" -#else - -/* Support for copy between different Storage types */ - -TH_API void THStorage_(rawCopy)(THStorage *storage, real *src); -TH_API void THStorage_(copy)(THStorage *storage, THStorage *src); -TH_API void THStorage_(copyByte)(THStorage *storage, struct THByteStorage *src); -TH_API void THStorage_(copyChar)(THStorage *storage, struct THCharStorage *src); -TH_API void THStorage_(copyShort)(THStorage *storage, struct THShortStorage *src); -TH_API void THStorage_(copyInt)(THStorage *storage, struct THIntStorage *src); -TH_API void THStorage_(copyLong)(THStorage *storage, struct THLongStorage *src); -TH_API void THStorage_(copyFloat)(THStorage *storage, struct THFloatStorage *src); -TH_API void THStorage_(copyDouble)(THStorage *storage, struct THDoubleStorage *src); -TH_API void THStorage_(copyHalf)(THStorage *storage, struct THHalfStorage *src); - -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/generic/THTensor.c b/contrib/lua-torch/torch7/lib/TH/generic/THTensor.c deleted file mode 100644 index e44e06ec3..000000000 --- 
a/contrib/lua-torch/torch7/lib/TH/generic/THTensor.c +++ /dev/null @@ -1,939 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "generic/THTensor.c" -#else - -/**** access methods ****/ -THStorage *THTensor_(storage)(const THTensor *self) -{ - return self->storage; -} - -ptrdiff_t THTensor_(storageOffset)(const THTensor *self) -{ - return self->storageOffset; -} - -int THTensor_(nDimension)(const THTensor *self) -{ - return self->nDimension; -} - -long THTensor_(size)(const THTensor *self, int dim) -{ - THArgCheck((dim >= 0) && (dim < self->nDimension), 2, "dimension %d out of range of %dD tensor", - dim+TH_INDEX_BASE, THTensor_(nDimension)(self)); - return self->size[dim]; -} - -long THTensor_(stride)(const THTensor *self, int dim) -{ - THArgCheck((dim >= 0) && (dim < self->nDimension), 2, "dimension %d out of range of %dD tensor", - dim+TH_INDEX_BASE, THTensor_(nDimension)(self)); - return self->stride[dim]; -} - -THLongStorage *THTensor_(newSizeOf)(THTensor *self) -{ - THLongStorage *size = THLongStorage_newWithSize(self->nDimension); - THLongStorage_rawCopy(size, self->size); - return size; -} - -THLongStorage *THTensor_(newStrideOf)(THTensor *self) -{ - THLongStorage *stride = THLongStorage_newWithSize(self->nDimension); - THLongStorage_rawCopy(stride, self->stride); - return stride; -} - -real *THTensor_(data)(const THTensor *self) -{ - if(self->storage) - return (self->storage->data+self->storageOffset); - else - return NULL; -} - -void THTensor_(setFlag)(THTensor *self, const char flag) -{ - self->flag |= flag; -} - -void THTensor_(clearFlag)(THTensor *self, const char flag) -{ - self->flag &= ~flag; -} - -/**** creation methods ****/ - -static void THTensor_(rawInit)(THTensor *self); - - -/* Empty init */ -THTensor *THTensor_(new)(void) -{ - THTensor *self = THAlloc(sizeof(THTensor)); - THTensor_(rawInit)(self); - return self; -} - -/* Pointer-copy init */ -THTensor *THTensor_(newWithTensor)(THTensor *tensor) -{ - THTensor *self = 
THAlloc(sizeof(THTensor)); - THTensor_(rawInit)(self); - THTensor_(setStorageNd)(self, - tensor->storage, - tensor->storageOffset, - tensor->nDimension, - tensor->size, - tensor->stride); - return self; -} - -/* Storage init */ -THTensor *THTensor_(newWithStorage)(THStorage *storage, ptrdiff_t storageOffset, THLongStorage *size, THLongStorage *stride) -{ - THTensor *self = THAlloc(sizeof(THTensor)); - if(size && stride) - THArgCheck(size->size == stride->size, 4, "inconsistent size"); - - THTensor_(rawInit)(self); -#ifdef DEBUG - THAssert((size ? size->size : (stride ? stride->size : 0)) <= INT_MAX); -#endif - THTensor_(setStorageNd)(self, - storage, - storageOffset, - (size ? size->size : (stride ? stride->size : 0)), - (size ? size->data : NULL), - (stride ? stride->data : NULL)); - - return self; -} -THTensor *THTensor_(newWithStorage1d)(THStorage *storage, ptrdiff_t storageOffset, - long size0, long stride0) -{ - return THTensor_(newWithStorage4d)(storage, storageOffset, size0, stride0, -1, -1, -1, -1, -1, -1); -} - -THTensor *THTensor_(newWithStorage2d)(THStorage *storage, ptrdiff_t storageOffset, - long size0, long stride0, - long size1, long stride1) -{ - return THTensor_(newWithStorage4d)(storage, storageOffset, size0, stride0, size1, stride1, -1, -1, -1, -1); -} - -THTensor *THTensor_(newWithStorage3d)(THStorage *storage, ptrdiff_t storageOffset, - long size0, long stride0, - long size1, long stride1, - long size2, long stride2) -{ - return THTensor_(newWithStorage4d)(storage, storageOffset, size0, stride0, size1, stride1, size2, stride2, -1, -1); -} - -THTensor *THTensor_(newWithStorage4d)(THStorage *storage, ptrdiff_t storageOffset, - long size0, long stride0, - long size1, long stride1, - long size2, long stride2, - long size3, long stride3) -{ - long size[4] = {size0, size1, size2, size3}; - long stride[4] = {stride0, stride1, stride2, stride3}; - - THTensor *self = THAlloc(sizeof(THTensor)); - THTensor_(rawInit)(self); - THTensor_(setStorageNd)(self, 
storage, storageOffset, 4, size, stride); - - return self; -} - -THTensor *THTensor_(newWithSize)(THLongStorage *size, THLongStorage *stride) -{ - return THTensor_(newWithStorage)(NULL, 0, size, stride); -} - -THTensor *THTensor_(newWithSize1d)(long size0) -{ - return THTensor_(newWithSize4d)(size0, -1, -1, -1); -} - -THTensor *THTensor_(newWithSize2d)(long size0, long size1) -{ - return THTensor_(newWithSize4d)(size0, size1, -1, -1); -} - -THTensor *THTensor_(newWithSize3d)(long size0, long size1, long size2) -{ - return THTensor_(newWithSize4d)(size0, size1, size2, -1); -} - -THTensor *THTensor_(newWithSize4d)(long size0, long size1, long size2, long size3) -{ - long size[4] = {size0, size1, size2, size3}; - - THTensor *self = THAlloc(sizeof(THTensor)); - THTensor_(rawInit)(self); - THTensor_(resizeNd)(self, 4, size, NULL); - - return self; -} - -THTensor *THTensor_(newClone)(THTensor *self) -{ - THTensor *tensor = THTensor_(new)(); - THTensor_(resizeAs)(tensor, self); - THTensor_(copy)(tensor, self); - return tensor; -} - -THTensor *THTensor_(newContiguous)(THTensor *self) -{ - if(!THTensor_(isContiguous)(self)) - return THTensor_(newClone)(self); - else - { - THTensor_(retain)(self); - return self; - } -} - -THTensor *THTensor_(newSelect)(THTensor *tensor, int dimension_, long sliceIndex_) -{ - THTensor *self = THTensor_(newWithTensor)(tensor); - THTensor_(select)(self, NULL, dimension_, sliceIndex_); - return self; -} - -THTensor *THTensor_(newNarrow)(THTensor *tensor, int dimension_, long firstIndex_, long size_) -{ - THTensor *self = THTensor_(newWithTensor)(tensor); - THTensor_(narrow)(self, NULL, dimension_, firstIndex_, size_); - return self; -} - -THTensor *THTensor_(newTranspose)(THTensor *tensor, int dimension1_, int dimension2_) -{ - THTensor *self = THTensor_(newWithTensor)(tensor); - THTensor_(transpose)(self, NULL, dimension1_, dimension2_); - return self; -} - -THTensor *THTensor_(newUnfold)(THTensor *tensor, int dimension_, long size_, long 
step_) -{ - THTensor *self = THTensor_(newWithTensor)(tensor); - THTensor_(unfold)(self, NULL, dimension_, size_, step_); - return self; -} - -THTensor *THTensor_(newView)(THTensor *tensor, THLongStorage *size) -{ - THArgCheck(THTensor_(isContiguous)(tensor), 1, "input is not contiguous"); - ptrdiff_t numel = THTensor_(nElement)(tensor); - THTensor *self = THTensor_(new)(); - THLongStorage *inferred_size = THLongStorage_newInferSize(size, numel); - THTensor_(setStorage)(self, tensor->storage, tensor->storageOffset, inferred_size, NULL); - THLongStorage_free(inferred_size); - return self; -} - -/* Resize */ -void THTensor_(resize)(THTensor *self, THLongStorage *size, THLongStorage *stride) -{ - THArgCheck(size != NULL, 2, "invalid size"); - if(stride) - THArgCheck(stride->size == size->size, 3, "invalid stride"); - -#ifdef DEBUG - THAssert(size->size <= INT_MAX); -#endif - THTensor_(resizeNd)(self, size->size, size->data, (stride ? stride->data : NULL)); -} - -void THTensor_(resizeAs)(THTensor *self, THTensor *src) -{ - if(!THTensor_(isSameSizeAs)(self, src)) - THTensor_(resizeNd)(self, src->nDimension, src->size, NULL); -} - -void THTensor_(resize1d)(THTensor *tensor, long size0) -{ - THTensor_(resize4d)(tensor, size0, -1, -1, -1); -} - -void THTensor_(resize2d)(THTensor *tensor, long size0, long size1) -{ - THTensor_(resize4d)(tensor, size0, size1, -1, -1); -} - -void THTensor_(resize3d)(THTensor *tensor, long size0, long size1, long size2) -{ - THTensor_(resize4d)(tensor, size0, size1, size2, -1); -} - -void THTensor_(resize4d)(THTensor *self, long size0, long size1, long size2, long size3) -{ - long size[4] = {size0, size1, size2, size3}; - - THTensor_(resizeNd)(self, 4, size, NULL); -} - -void THTensor_(resize5d)(THTensor *self, long size0, long size1, long size2, long size3, long size4) -{ - long size[5] = {size0, size1, size2, size3, size4}; - - THTensor_(resizeNd)(self, 5, size, NULL); -} - -THTensor* THTensor_(newExpand)(THTensor *tensor, THLongStorage 
*sizes) { - THTensor *result = THTensor_(new)(); - THTensor_(expand)(result, tensor, sizes); - return result; -} - -void THTensor_(expand)(THTensor *r, THTensor *tensor, THLongStorage *sizes) { - THArgCheck(THTensor_(nDimension)(tensor) > 0, 0, "can't expand an empty tensor"); - THArgCheck(THLongStorage_size(sizes) >= THTensor_(nDimension)(tensor), 1, - "the number of sizes provided must be greater or equal to the " - "number of dimensions in the tensor"); - - long *expandedSizes; - long *expandedStrides; - char error_buffer[1024]; - int ret = - THLongStorage_inferExpandGeometry(tensor->size, tensor->stride, THTensor_(nDimension)(tensor), - sizes, &expandedSizes, &expandedStrides, error_buffer, 1024); - - if (ret != 0) { - THError(error_buffer); - return; - } - - THTensor_(setStorageNd)(r, THTensor_(storage)(tensor), THTensor_(storageOffset)(tensor), - THLongStorage_size(sizes), expandedSizes, expandedStrides); - THFree(expandedSizes); - THFree(expandedStrides); -} - - -void THTensor_(expandNd)(THTensor **rets, THTensor **ops, int count) { - for (int i = 0; i < count; ++i) { - THArgCheck(THTensor_(nDimension)(ops[i]) > 0, i, "can't expand empty tensor %d", i); - } - - long *op_sizes[count]; - long op_dims[count]; - - for (int i = 0; i < count; ++i) { - op_sizes[i] = ops[i]->size; - op_dims[i] = ops[i]->nDimension; - } - - THLongStorage *sizes = THLongStorage_new(); - char error_buffer[1024]; - int ret = THLongStorage_inferSizeN(sizes, - count, - op_sizes, - op_dims, - error_buffer, - 1024); - - if(ret != 0) { - THLongStorage_free(sizes); - THError(error_buffer); - return; - } - - for (int i = 0; i < count; ++i) { - THTensor_(expand)(rets[i], ops[i], sizes); - } - - THLongStorage_free(sizes); -} - -void THTensor_(set)(THTensor *self, THTensor *src) -{ - if(self != src) - THTensor_(setStorageNd)(self, - src->storage, - src->storageOffset, - src->nDimension, - src->size, - src->stride); -} - -void THTensor_(setStorage)(THTensor *self, THStorage *storage_, ptrdiff_t 
storageOffset_, THLongStorage *size_, THLongStorage *stride_) -{ - if(size_ && stride_) - THArgCheck(size_->size == stride_->size, 5, "inconsistent size/stride sizes"); - -#ifdef DEBUG - THAssert((size_ ? size_->size : (stride_ ? stride_->size : 0)) <= INT_MAX); -#endif - THTensor_(setStorageNd)(self, - storage_, - storageOffset_, - (size_ ? size_->size : (stride_ ? stride_->size : 0)), - (size_ ? size_->data : NULL), - (stride_ ? stride_->data : NULL)); -} - -void THTensor_(setStorage1d)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, - long size0_, long stride0_) -{ - THTensor_(setStorage4d)(self, storage_, storageOffset_, - size0_, stride0_, - -1, -1, - -1, -1, - -1, -1); -} - -void THTensor_(setStorage2d)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, - long size0_, long stride0_, - long size1_, long stride1_) -{ - THTensor_(setStorage4d)(self, storage_, storageOffset_, - size0_, stride0_, - size1_, stride1_, - -1, -1, - -1, -1); -} - -void THTensor_(setStorage3d)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, - long size0_, long stride0_, - long size1_, long stride1_, - long size2_, long stride2_) -{ - THTensor_(setStorage4d)(self, storage_, storageOffset_, - size0_, stride0_, - size1_, stride1_, - size2_, stride2_, - -1, -1); -} - -void THTensor_(setStorage4d)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, - long size0_, long stride0_, - long size1_, long stride1_, - long size2_, long stride2_, - long size3_, long stride3_) -{ - - long size[4] = {size0_, size1_, size2_, size3_}; - long stride[4] = {stride0_, stride1_, stride2_, stride3_}; - - THTensor_(setStorageNd)(self, storage_, storageOffset_, 4, size, stride); -} - - -void THTensor_(narrow)(THTensor *self, THTensor *src, int dimension, long firstIndex, long size) -{ - if(!src) - src = self; - - THArgCheck( (dimension >= 0) && (dimension < src->nDimension), 2, "out of range"); - THArgCheck( (firstIndex >= 0) && (firstIndex < 
src->size[dimension]), 3, "out of range"); - THArgCheck( (size > 0) && (firstIndex <= src->size[dimension] - size), 4, "out of range"); - - THTensor_(set)(self, src); - - if(firstIndex > 0) - self->storageOffset += firstIndex*self->stride[dimension]; - - self->size[dimension] = size; -} - -void THTensor_(select)(THTensor *self, THTensor *src, int dimension, long sliceIndex) -{ - int d; - - if(!src) - src = self; - - THArgCheck(src->nDimension > 1, 1, "cannot select on a vector"); - THArgCheck((dimension >= 0) && (dimension < src->nDimension), 2, "out of range"); - THArgCheck((sliceIndex >= 0) && (sliceIndex < src->size[dimension]), 3, "out of range"); - - THTensor_(set)(self, src); - THTensor_(narrow)(self, NULL, dimension, sliceIndex, 1); - for(d = dimension; d < self->nDimension-1; d++) - { - self->size[d] = self->size[d+1]; - self->stride[d] = self->stride[d+1]; - } - self->nDimension--; -} - -void THTensor_(transpose)(THTensor *self, THTensor *src, int dimension1, int dimension2) -{ - long z; - - if(!src) - src = self; - - THArgCheck( (dimension1 >= 0) && (dimension1 < src->nDimension), 1, "out of range"); - THArgCheck( (dimension2 >= 0) && (dimension2 < src->nDimension), 2, "out of range"); - - THTensor_(set)(self, src); - - if(dimension1 == dimension2) - return; - - z = self->stride[dimension1]; - self->stride[dimension1] = self->stride[dimension2]; - self->stride[dimension2] = z; - z = self->size[dimension1]; - self->size[dimension1] = self->size[dimension2]; - self->size[dimension2] = z; -} - -void THTensor_(unfold)(THTensor *self, THTensor *src, int dimension, long size, long step) -{ - long *newSize; - long *newStride; - int d; - - if(!src) - src = self; - - THArgCheck( (src->nDimension > 0), 1, "cannot unfold an empty tensor"); - THArgCheck((dimension >= 0) && (dimension < src->nDimension), 2, "out of range"); - THArgCheck(size <= src->size[dimension], 3, "out of range"); - THArgCheck(step > 0, 4, "invalid step"); - - THTensor_(set)(self, src); - - 
newSize = THAlloc(sizeof(long)*(self->nDimension+1)); - newStride = THAlloc(sizeof(long)*(self->nDimension+1)); - - newSize[self->nDimension] = size; - newStride[self->nDimension] = self->stride[dimension]; - for(d = 0; d < self->nDimension; d++) - { - if(d == dimension) - { - newSize[d] = (self->size[d] - size) / step + 1; - newStride[d] = step*self->stride[d]; - } - else - { - newSize[d] = self->size[d]; - newStride[d] = self->stride[d]; - } - } - - THFree(self->size); - THFree(self->stride); - - self->size = newSize; - self->stride = newStride; - self->nDimension++; -} - -/* we have to handle the case where the result is a number */ -void THTensor_(squeeze)(THTensor *self, THTensor *src) -{ - int ndim = 0; - int d; - - if(!src) - src = self; - - THTensor_(set)(self, src); - - for(d = 0; d < src->nDimension; d++) - { - if(src->size[d] != 1) - { - if(d != ndim) - { - self->size[ndim] = src->size[d]; - self->stride[ndim] = src->stride[d]; - } - ndim++; - } - } - - /* right now, we do not handle 0-dimension tensors */ - if(ndim == 0 && src->nDimension > 0) - { - self->size[0] = 1; - self->stride[0] = 1; - ndim = 1; - } - self->nDimension = ndim; -} - -void THTensor_(squeeze1d)(THTensor *self, THTensor *src, int dimension) -{ - int d; - - if(!src) - src = self; - - THArgCheck((dimension >= 0) && (dimension < src->nDimension), 2, "dimension out of range"); - - THTensor_(set)(self, src); - - if(src->size[dimension] == 1 && src->nDimension > 1) - { - for(d = dimension; d < self->nDimension-1; d++) - { - self->size[d] = self->size[d+1]; - self->stride[d] = self->stride[d+1]; - } - self->nDimension--; - } -} - -void THTensor_(unsqueeze1d)(THTensor *self, THTensor *src, int dimension) -{ - int d; - - if(!src) - src = self; - - THArgCheck((dimension >= 0) && (dimension <= src->nDimension), 2, "dimension out of range"); - THArgCheck(src->nDimension > 0, 2, "cannot unsqueeze empty tensor"); - - THTensor_(set)(self, src); - - self->size = (long*)THRealloc(self->size, 
sizeof(long)*(self->nDimension+1)); - self->stride = (long*)THRealloc(self->stride, sizeof(long)*(self->nDimension+1)); - self->nDimension++; - for (d = self->nDimension-1; d > dimension; d--) { - self->size[d] = self->size[d-1]; - self->stride[d] = self->stride[d-1]; - } - if (dimension+1 < self->nDimension) { - self->stride[dimension] = self->size[dimension+1] * self->stride[dimension+1]; - } else { - self->stride[dimension] = 1; - } - self->size[dimension] = 1; -} - -int THTensor_(isTransposed)(const THTensor *self) -{ - if (THTensor_(isContiguous)(self)) { - return 0; - } - long max_stride = 1; - long size_max_stride = 1; - long z = 1; - int d; - for (d = 0; d < self->nDimension; ++d) { - if (self->stride[d] == 0 && self->size[d] != 1) - return 0; - if (self->stride[d] > max_stride) { - max_stride = self->stride[d]; - size_max_stride = self->size[d]; - } - z *= self->size[d]; - } - if (z == max_stride * size_max_stride) { - return 1; - } - return 0; -} - -int THTensor_(isContiguous)(const THTensor *self) -{ - long z = 1; - int d; - for(d = self->nDimension-1; d >= 0; d--) - { - if(self->size[d] != 1) - { - if(self->stride[d] == z) - z *= self->size[d]; - else - return 0; - } - } - return 1; -} - -int THTensor_(isSize)(const THTensor *self, const THLongStorage *dims) -{ - int d; - if (self->nDimension != dims->size) - return 0; - - for(d = 0; d < self->nDimension; ++d) - { - if(self->size[d] != dims->data[d]) - return 0; - } - return 1; -} - -int THTensor_(isSameSizeAs)(const THTensor *self, const THTensor* src) -{ - int d; - if (self->nDimension != src->nDimension) - return 0; - for(d = 0; d < self->nDimension; ++d) - { - if(self->size[d] != src->size[d]) - return 0; - } - return 1; -} - -int THTensor_(isSetTo)(const THTensor *self, const THTensor* src) -{ - if (!self->storage) - return 0; - if (self->storage == src->storage && - self->storageOffset == src->storageOffset && - self->nDimension == src->nDimension) - { - int d; - for (d = 0; d < self->nDimension; 
++d) - { - if (self->size[d] != src->size[d] || self->stride[d] != src->stride[d]) - return 0; - } - return 1; - } - return 0; -} - -ptrdiff_t THTensor_(nElement)(const THTensor *self) -{ - if(self->nDimension == 0) - return 0; - else - { - ptrdiff_t nElement = 1; - int d; - for(d = 0; d < self->nDimension; d++) - nElement *= self->size[d]; - return nElement; - } -} - -void THTensor_(retain)(THTensor *self) -{ - if(self->flag & TH_TENSOR_REFCOUNTED) - THAtomicIncrementRef(&self->refcount); -} - -void THTensor_(free)(THTensor *self) -{ - if(!self) - return; - - if(self->flag & TH_TENSOR_REFCOUNTED) - { - if(THAtomicDecrementRef(&self->refcount)) - { - THFree(self->size); - THFree(self->stride); - if(self->storage) - THStorage_(free)(self->storage); - THFree(self); - } - } -} - -void THTensor_(freeCopyTo)(THTensor *self, THTensor *dst) -{ - if(self != dst) - THTensor_(copy)(dst, self); - - THTensor_(free)(self); -} - -/*******************************************************************************/ - -static void THTensor_(rawInit)(THTensor *self) -{ - self->refcount = 1; - self->storage = NULL; - self->storageOffset = 0; - self->size = NULL; - self->stride = NULL; - self->nDimension = 0; - self->flag = TH_TENSOR_REFCOUNTED; -} - -void THTensor_(setStorageNd)(THTensor *self, THStorage *storage, ptrdiff_t storageOffset, int nDimension, long *size, long *stride) -{ - /* storage */ - if(self->storage != storage) - { - if(self->storage) - THStorage_(free)(self->storage); - - if(storage) - { - self->storage = storage; - THStorage_(retain)(self->storage); - } - else - self->storage = NULL; - } - - /* storageOffset */ - if(storageOffset < 0) - THError("Tensor: invalid storage offset"); - self->storageOffset = storageOffset; - - /* size and stride */ - THTensor_(resizeNd)(self, nDimension, size, stride); -} - -void THTensor_(resizeNd)(THTensor *self, int nDimension, long *size, long *stride) -{ - int d; - int nDimension_; - ptrdiff_t totalSize; - int hascorrectsize = 1; - - 
nDimension_ = 0; - for(d = 0; d < nDimension; d++) - { - if(size[d] > 0) - { - nDimension_++; - if((self->nDimension > d) && (size[d] != self->size[d])) - hascorrectsize = 0; - - if((self->nDimension > d) && stride && (stride[d] >= 0) && (stride[d] != self->stride[d])) - hascorrectsize = 0; - } - else - break; - } - nDimension = nDimension_; - - if(nDimension != self->nDimension) - hascorrectsize = 0; - - if(hascorrectsize) - return; - - if(nDimension > 0) - { - if(nDimension != self->nDimension) - { - self->size = THRealloc(self->size, sizeof(long)*nDimension); - self->stride = THRealloc(self->stride, sizeof(long)*nDimension); - self->nDimension = nDimension; - } - - totalSize = 1; - for(d = self->nDimension-1; d >= 0; d--) - { - self->size[d] = size[d]; - if(stride && (stride[d] >= 0) ) - self->stride[d] = stride[d]; - else - { - if(d == self->nDimension-1) - self->stride[d] = 1; - else - self->stride[d] = self->size[d+1]*self->stride[d+1]; - } - totalSize += (self->size[d]-1)*self->stride[d]; - } - - if(totalSize+self->storageOffset > 0) - { - if(!self->storage) - self->storage = THStorage_(new)(); - if(totalSize+self->storageOffset > self->storage->size) - THStorage_(resize)(self->storage, totalSize+self->storageOffset); - } - } - else - self->nDimension = 0; -} - -void THTensor_(set1d)(THTensor *tensor, long x0, real value) -{ - THArgCheck(tensor->nDimension == 1, 1, "tensor must have one dimension"); - THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]), 2, "out of range"); - THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0], value); -} - -real THTensor_(get1d)(const THTensor *tensor, long x0) -{ - THArgCheck(tensor->nDimension == 1, 1, "tensor must have one dimension"); - THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]), 2, "out of range"); - return THStorage_(get)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]); -} - -void THTensor_(set2d)(THTensor *tensor, long x0, long x1, real value) -{ - THArgCheck(tensor->nDimension == 
2, 1, "tensor must have two dimensions"); - THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]), 2, "out of range"); - THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1], value); -} - -real THTensor_(get2d)(const THTensor *tensor, long x0, long x1) -{ - THArgCheck(tensor->nDimension == 2, 1, "tensor must have two dimensions"); - THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]), 2, "out of range"); - return THStorage_(get)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]); -} - -void THTensor_(set3d)(THTensor *tensor, long x0, long x1, long x2, real value) -{ - THArgCheck(tensor->nDimension == 3, 1, "tensor must have three dimensions"); - THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]), 2, "out of range"); - THStorage_(set)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2], value); -} - -real THTensor_(get3d)(const THTensor *tensor, long x0, long x1, long x2) -{ - THArgCheck(tensor->nDimension == 3, 1, "tensor must have three dimensions"); - THArgCheck( (x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]), 2, "out of range"); - return THStorage_(get)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2]); -} - -void THTensor_(set4d)(THTensor *tensor, long x0, long x1, long x2, long x3, real value) -{ - THArgCheck(tensor->nDimension == 4, 1, "tensor must have four dimensions"); - THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]) && (x3 >= 0) && (x3 < tensor->size[3]), 2, "out of range"); - THStorage_(set)(tensor->storage, 
tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2]+x3*tensor->stride[3], value); -} - -real THTensor_(get4d)(const THTensor *tensor, long x0, long x1, long x2, long x3) -{ - THArgCheck(tensor->nDimension == 4, 1, "tensor must have four dimensions"); - THArgCheck((x0 >= 0) && (x0 < tensor->size[0]) && (x1 >= 0) && (x1 < tensor->size[1]) && (x2 >= 0) && (x2 < tensor->size[2]) && (x3 >= 0) && (x3 < tensor->size[3]), 2, "out of range"); - return THStorage_(get)(tensor->storage, tensor->storageOffset+x0*tensor->stride[0]+x1*tensor->stride[1]+x2*tensor->stride[2]+x3*tensor->stride[3]); -} - -THDescBuff THTensor_(desc)(const THTensor *tensor) { - const int L = TH_DESC_BUFF_LEN; - THDescBuff buf; - char *str = buf.str; - int n = 0; -#define _stringify(x) #x - n += snprintf(str, L-n, "torch." _stringify(x) "Tensor of size "); -#undef _stringify - int i; - for(i = 0; i < tensor->nDimension; i++) { - if(n >= L) break; - n += snprintf(str+n, L-n, "%ld", tensor->size[i]); - if(i < tensor->nDimension-1) { - n += snprintf(str+n, L-n, "x"); - } - } - if(n >= L) { - snprintf(str+L-4, 4, "..."); - } - return buf; -} - -THDescBuff THTensor_(sizeDesc)(const THTensor *tensor) { - THLongStorage *size = THTensor_(newSizeOf)((THTensor*)tensor); - THDescBuff buf = THLongStorage_sizeDesc(size); - THLongStorage_free(size); - return buf; -} - -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/generic/THTensor.h b/contrib/lua-torch/torch7/lib/TH/generic/THTensor.h deleted file mode 100644 index 9fb246c85..000000000 --- a/contrib/lua-torch/torch7/lib/TH/generic/THTensor.h +++ /dev/null @@ -1,138 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "generic/THTensor.h" -#else - -/* a la lua? dim, storageoffset, ... et les methodes ? 
*/ - -#define TH_TENSOR_REFCOUNTED 1 - -typedef struct THTensor -{ - long *size; - long *stride; - int nDimension; - - THStorage *storage; - ptrdiff_t storageOffset; - int refcount; - - char flag; - -} THTensor; - - -/**** access methods ****/ -TH_API THStorage* THTensor_(storage)(const THTensor *self); -TH_API ptrdiff_t THTensor_(storageOffset)(const THTensor *self); -TH_API int THTensor_(nDimension)(const THTensor *self); -TH_API long THTensor_(size)(const THTensor *self, int dim); -TH_API long THTensor_(stride)(const THTensor *self, int dim); -TH_API THLongStorage *THTensor_(newSizeOf)(THTensor *self); -TH_API THLongStorage *THTensor_(newStrideOf)(THTensor *self); -TH_API real *THTensor_(data)(const THTensor *self); - -TH_API void THTensor_(setFlag)(THTensor *self, const char flag); -TH_API void THTensor_(clearFlag)(THTensor *self, const char flag); - - -/**** creation methods ****/ -TH_API THTensor *THTensor_(new)(void); -TH_API THTensor *THTensor_(newWithTensor)(THTensor *tensor); -/* stride might be NULL */ -TH_API THTensor *THTensor_(newWithStorage)(THStorage *storage_, ptrdiff_t storageOffset_, THLongStorage *size_, THLongStorage *stride_); -TH_API THTensor *THTensor_(newWithStorage1d)(THStorage *storage_, ptrdiff_t storageOffset_, - long size0_, long stride0_); -TH_API THTensor *THTensor_(newWithStorage2d)(THStorage *storage_, ptrdiff_t storageOffset_, - long size0_, long stride0_, - long size1_, long stride1_); -TH_API THTensor *THTensor_(newWithStorage3d)(THStorage *storage_, ptrdiff_t storageOffset_, - long size0_, long stride0_, - long size1_, long stride1_, - long size2_, long stride2_); -TH_API THTensor *THTensor_(newWithStorage4d)(THStorage *storage_, ptrdiff_t storageOffset_, - long size0_, long stride0_, - long size1_, long stride1_, - long size2_, long stride2_, - long size3_, long stride3_); - -/* stride might be NULL */ -TH_API THTensor *THTensor_(newWithSize)(THLongStorage *size_, THLongStorage *stride_); -TH_API THTensor 
*THTensor_(newWithSize1d)(long size0_); -TH_API THTensor *THTensor_(newWithSize2d)(long size0_, long size1_); -TH_API THTensor *THTensor_(newWithSize3d)(long size0_, long size1_, long size2_); -TH_API THTensor *THTensor_(newWithSize4d)(long size0_, long size1_, long size2_, long size3_); - -TH_API THTensor *THTensor_(newClone)(THTensor *self); -TH_API THTensor *THTensor_(newContiguous)(THTensor *tensor); -TH_API THTensor *THTensor_(newSelect)(THTensor *tensor, int dimension_, long sliceIndex_); -TH_API THTensor *THTensor_(newNarrow)(THTensor *tensor, int dimension_, long firstIndex_, long size_); -TH_API THTensor *THTensor_(newTranspose)(THTensor *tensor, int dimension1_, int dimension2_); -TH_API THTensor *THTensor_(newUnfold)(THTensor *tensor, int dimension_, long size_, long step_); -TH_API THTensor *THTensor_(newView)(THTensor *tensor, THLongStorage *size); -TH_API THTensor *THTensor_(newExpand)(THTensor *tensor, THLongStorage *size); - -TH_API void THTensor_(expand)(THTensor *r, THTensor *tensor, THLongStorage *size); -TH_API void THTensor_(expandNd)(THTensor **rets, THTensor **ops, int count); - -TH_API void THTensor_(resize)(THTensor *tensor, THLongStorage *size, THLongStorage *stride); -TH_API void THTensor_(resizeAs)(THTensor *tensor, THTensor *src); -TH_API void THTensor_(resizeNd)(THTensor *tensor, int nDimension, long *size, long *stride); -TH_API void THTensor_(resize1d)(THTensor *tensor, long size0_); -TH_API void THTensor_(resize2d)(THTensor *tensor, long size0_, long size1_); -TH_API void THTensor_(resize3d)(THTensor *tensor, long size0_, long size1_, long size2_); -TH_API void THTensor_(resize4d)(THTensor *tensor, long size0_, long size1_, long size2_, long size3_); -TH_API void THTensor_(resize5d)(THTensor *tensor, long size0_, long size1_, long size2_, long size3_, long size4_); - -TH_API void THTensor_(set)(THTensor *self, THTensor *src); -TH_API void THTensor_(setStorage)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, 
THLongStorage *size_, THLongStorage *stride_); -TH_API void THTensor_(setStorageNd)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, int nDimension, long *size, long *stride); -TH_API void THTensor_(setStorage1d)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, - long size0_, long stride0_); -TH_API void THTensor_(setStorage2d)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, - long size0_, long stride0_, - long size1_, long stride1_); -TH_API void THTensor_(setStorage3d)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, - long size0_, long stride0_, - long size1_, long stride1_, - long size2_, long stride2_); -TH_API void THTensor_(setStorage4d)(THTensor *self, THStorage *storage_, ptrdiff_t storageOffset_, - long size0_, long stride0_, - long size1_, long stride1_, - long size2_, long stride2_, - long size3_, long stride3_); - -TH_API void THTensor_(narrow)(THTensor *self, THTensor *src, int dimension_, long firstIndex_, long size_); -TH_API void THTensor_(select)(THTensor *self, THTensor *src, int dimension_, long sliceIndex_); -TH_API void THTensor_(transpose)(THTensor *self, THTensor *src, int dimension1_, int dimension2_); -TH_API void THTensor_(unfold)(THTensor *self, THTensor *src, int dimension_, long size_, long step_); - -TH_API void THTensor_(squeeze)(THTensor *self, THTensor *src); -TH_API void THTensor_(squeeze1d)(THTensor *self, THTensor *src, int dimension_); -TH_API void THTensor_(unsqueeze1d)(THTensor *self, THTensor *src, int dimension_); - -TH_API int THTensor_(isContiguous)(const THTensor *self); -TH_API int THTensor_(isSameSizeAs)(const THTensor *self, const THTensor *src); -TH_API int THTensor_(isSetTo)(const THTensor *self, const THTensor *src); -TH_API int THTensor_(isSize)(const THTensor *self, const THLongStorage *dims); -TH_API ptrdiff_t THTensor_(nElement)(const THTensor *self); - -TH_API void THTensor_(retain)(THTensor *self); -TH_API void THTensor_(free)(THTensor *self); -TH_API 
void THTensor_(freeCopyTo)(THTensor *self, THTensor *dst); - -/* Slow access methods [check everything] */ -TH_API void THTensor_(set1d)(THTensor *tensor, long x0, real value); -TH_API void THTensor_(set2d)(THTensor *tensor, long x0, long x1, real value); -TH_API void THTensor_(set3d)(THTensor *tensor, long x0, long x1, long x2, real value); -TH_API void THTensor_(set4d)(THTensor *tensor, long x0, long x1, long x2, long x3, real value); - -TH_API real THTensor_(get1d)(const THTensor *tensor, long x0); -TH_API real THTensor_(get2d)(const THTensor *tensor, long x0, long x1); -TH_API real THTensor_(get3d)(const THTensor *tensor, long x0, long x1, long x2); -TH_API real THTensor_(get4d)(const THTensor *tensor, long x0, long x1, long x2, long x3); - -/* Debug methods */ -TH_API THDescBuff THTensor_(desc)(const THTensor *tensor); -TH_API THDescBuff THTensor_(sizeDesc)(const THTensor *tensor); - -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/generic/THTensorConv.c b/contrib/lua-torch/torch7/lib/TH/generic/THTensorConv.c deleted file mode 100644 index 684ff9db5..000000000 --- a/contrib/lua-torch/torch7/lib/TH/generic/THTensorConv.c +++ /dev/null @@ -1,1957 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "generic/THTensorConv.c" -#else - -/* - 2D Input, 2D kernel : convolve given image with the given kernel. -*/ -void THTensor_(validXCorr2Dptr)(real *r_, - real alpha, - real *t_, long ir, long ic, - real *k_, long kr, long kc, - long sr, long sc) -{ - long or = (ir - kr) / sr + 1; - long oc = (ic - kc) / sc + 1; - - long xx, yy, kx, ky; - - if ((sc != 1) || (oc < 4)) { - /* regular convolution */ - for(yy = 0; yy < or; yy++) { - for(xx = 0; xx < oc; xx++) { - /* Dot product in two dimensions... 
(between input image and the mask) */ - real *pi_ = t_ + yy*sr*ic + xx*sc; - real *pw_ = k_; - real sum = 0; - for(ky = 0; ky < kr; ky++) { - for(kx = 0; kx < kc; kx++) { - sum += pi_[kx]*pw_[kx]; - } - pi_ += ic; /* next input line */ - pw_ += kc; /* next mask line */ - } - /* Update output */ - *r_++ += alpha*sum; - } - } - - } else { - /* SSE-based convolution */ - for(yy = 0; yy < or; yy++) { - real *pi_ = t_ + yy*sr*ic; - real *pw_ = k_; - for (ky = 0; ky < kr; ky++) { - real *pis_ = pi_; - for (kx = 0; kx < kc; kx++) { - THVector_(cadd)(r_, r_, pis_, alpha*pw_[kx], oc); - pis_++; - } - pi_ += ic; /* next input line */ - pw_ += kc; /* next mask line */ - } - r_ += oc; - } - } -} - -/* - 2D Input, 2D kernel : convolve given image with the given kernel. -*/ -void THTensor_(validConv2Dptr)(real *r_, - real alpha, - real *t_, long ir, long ic, - real *k_, long kr, long kc, - long sr, long sc) -{ - long or = (ir - kr) / sr + 1; - long oc = (ic - kc) / sc + 1; - - long xx, yy, kx, ky; - - if ((sc != 1) || (oc < 4)) { - /* regular convolution */ - for(yy = 0; yy < or; yy++) { - for(xx = 0; xx < oc; xx++) { - /* Dot product in two dimensions... (between input image and the mask) */ - real *pi_ = t_ + yy*sr*ic + xx*sc; - real *pw_ = k_ + kr*kc - 1; - real sum = 0; - for(ky = 0; ky < kr; ky++) { - for(kx = 0; kx < kc; kx++) { - sum += pi_[kx]*pw_[-kx]; - } - pi_ += ic; /* next input line */ - pw_ -= kc; /* next mask line */ - } - /* Update output */ - *r_++ += alpha*sum; - } - } - - } else { - /* SSE-based convolution */ - for(yy = 0; yy < or; yy++) { - real *pw_ = k_ + kr*kc - 1; - real *pi_ = t_ + yy*sr*ic; - for (ky = 0; ky < kr; ky++) { - real *pis_ = pi_; - for (kx = 0; kx < kc; kx++) { - THVector_(cadd)(r_, r_, pis_, alpha*pw_[-kx], oc); - pis_++; - } - pi_ += ic; /* next input line */ - pw_ -= kc; /* next mask line */ - } - r_ += oc; - } - } -} - -/* - 2D Input, 2D kernel : convolve given image with the given kernel, full convolution. 
-*/ -void THTensor_(fullConv2Dptr)(real *r_, - real alpha, - real *t_, long ir, long ic, - real *k_, long kr, long kc, - long sr, long sc) -{ - long oc = (ic - 1) * sc + kc; - - long xx, yy, kx, ky; - - if ((sc != 1) || (ic < 4)) { - /* regular convolution */ - for(yy = 0; yy < ir; yy++) { - for(xx = 0; xx < ic; xx++) { - /* Outer product in two dimensions... (between input image and the mask) */ - real *po_ = r_ + yy*sr*oc + xx*sc; - real *pw_ = k_; - for(ky = 0; ky < kr; ky++) - { - real z = *t_ * alpha; - for(kx = 0; kx < kc; kx++) { - po_[kx] += z * pw_[kx]; - } - po_ += oc; /* next input line */ - pw_ += kc; /* next mask line */ - } - t_++; - } - } - - } else { - /* SSE-based convolution */ - for(yy = 0; yy < ir; yy++) { - real *po_ = r_ + yy*sr*oc; - real *pw_ = k_; - for (ky = 0; ky < kr; ky++) { - real *pos_ = po_; - for (kx = 0; kx < kc; kx++) { - THVector_(cadd)(pos_, pos_, t_, alpha*pw_[kx], ic); - pos_++; - } - po_ += oc; /* next input line */ - pw_ += kc; /* next mask line */ - } - t_ += ic; - } - } -} - -/* - 2D Input, 2D kernel : convolve given image with the given kernel, full convolution. -*/ -void THTensor_(fullXCorr2Dptr)(real *r_, - real alpha, - real *t_, long ir, long ic, - real *k_, long kr, long kc, - long sr, long sc) -{ - long oc = (ic - 1) * sc + kc; - - long xx, yy, kx, ky; - - if ((sc != 1) || (ic < 4)) { - /* regular convolution */ - for(yy = 0; yy < ir; yy++) { - for(xx = 0; xx < ic; xx++) { - /* Outer product in two dimensions... 
(between input image and the mask) */ - real *po_ = r_ + yy*sr*oc + xx*sc; - real *pw_ = k_ + kr*kc -1; - long kx, ky; - for(ky = 0; ky < kr; ky++) - { - real z = *t_ * alpha; - for(kx = 0; kx < kc; kx++) { - po_[kx] += z * pw_[-kx]; - } - po_ += oc; /* next input line */ - pw_ -= kc; /* next mask line */ - } - t_++; - } - } - - } else { - /* SSE-based convolution */ - for(yy = 0; yy < ir; yy++) { - real *po_ = r_ + yy*sr*oc; - real *pw_ = k_ + kr*kc -1; - for (ky = 0; ky < kr; ky++) { - real *pos_ = po_; - for (kx = 0; kx < kc; kx++) { - THVector_(cadd)(pos_, pos_, t_, pw_[-kx]*alpha, ic); - pos_++; - } - po_ += oc; /* next input line */ - pw_ -= kc; /* next mask line */ - } - t_ += ic; - } - } -} - -/* - 2D Input, 2D kernel : convolve given image with the given kernel, valid convolution. - for sr,sc=1 this is equivalent to validXCorr2Dptr, but otherwise it is useful for - calculating derivatives wrt a kernel that is applied with stride sr,sc != 1 -*/ -void THTensor_(validXCorr2DRevptr)(real *r_, - real alpha, - real *t_, long ir, long ic, - real *k_, long kr, long kc, - long sr, long sc) -{ - long or = ir - (kr - 1) * sr; - long oc = ic - (kc - 1) * sc; - - long xx, yy, kx, ky; - - if ((sc != 1) || (kc < 4)) { - /* regular convolution */ - for(yy = 0; yy < kr; yy++) { - for(xx = 0; xx < kc; xx++) { - real *po_ = r_; - real *pi_ = t_ + yy*sr*ic + xx*sc; - real z = *k_++ * alpha; - - for(ky = 0; ky < or; ky++) { - for(kx = 0; kx < oc; kx++) - po_[kx] += z * pi_[kx]; - pi_ += ic; - po_ += oc; - } - } - } - - } else { - /* SSE-based convolution */ - for(yy = 0; yy < kr; yy++) { - for(xx = 0; xx < kc; xx++) { - real *po_ = r_; - real *pi_ = t_ + yy*sr*ic + xx*sc; - real z = *k_++ * alpha; - - for(ky = 0; ky < or; ky++) { - THVector_(cadd)(po_, po_, pi_, z, oc); - pi_ += ic; - po_ += oc; - } - } - } - } -} -/* - 3D Input, 3D kernel : convolve given volume with the given kernel. 
-*/ -void THTensor_(validXCorr3Dptr)(real *r_, - real alpha, - real *t_, long it, long ir, long ic, - real *k_, long kt, long kr, long kc, - long st, long sr, long sc) -{ - long ot = (it - kt) / st + 1; - long or = (ir - kr) / sr + 1; - long oc = (ic - kc) / sc + 1; - - long zz, xx, yy; - - for (zz = 0; zz < ot; zz++) - { - for(yy = 0; yy < or; yy++) - { - for(xx = 0; xx < oc; xx++) - { - /* Dot product in two dimensions... (between input image and the mask) */ - real *pi_ = t_ + zz*st*ir*ic + yy*sr*ic + xx*sc; - real *pw_ = k_; - real sum = 0; - long kz, kx, ky; - for(kz = 0; kz < kt; kz++) - { - for(ky = 0; ky < kr; ky++) - { - for(kx = 0; kx < kc; kx++) { - sum += pi_[kx]*pw_[kx]; - } - pi_ += ic; /* next input line */ - pw_ += kc; /* next mask line */ - } - pi_ += (ir-kr)*ic; /* next input slice */ - } - /* Update output */ - *r_++ += sum*alpha; - } - } - } -} - -/* - 3D Input, 3D kernel : convolve given volume with the given kernel. -*/ -void THTensor_(validConv3Dptr)(real *r_, - real alpha, - real *t_, long it, long ir, long ic, - real *k_, long kt, long kr, long kc, - long st, long sr, long sc) -{ - long ot = (it - kt) / st + 1; - long or = (ir - kr) / sr + 1; - long oc = (ic - kc) / sc + 1; - - long zz, xx, yy; - - for(zz = 0; zz < ot; zz++) - { - for(yy = 0; yy < or; yy++) - { - for(xx = 0; xx < oc; xx++) - { - /* Dot product in two dimensions... (between input image and the mask) */ - real *pi_ = t_ + zz*st*ir*ic + yy*sr*ic + xx*sc; - real *pw_ = k_ + kt*kr*kc - 1; - real sum = 0; - long kz, kx, ky; - for(kz = 0; kz < kt; kz++) - { - for(ky = 0; ky < kr; ky++) - { - for(kx = 0; kx < kc; kx++) { - sum += pi_[kx]*pw_[-kx]; - } - pi_ += ic; /* next input line */ - pw_ -= kc; /* next mask line */ - } - pi_ += (ir-kr)*ic; /* next input slice */ - } - /* Update output */ - *r_++ += alpha*sum; - } - } - } -} - - -/* - 3D Input, 3D kernel : convolve given volume with the given kernel, full convolution. 
-*/ -void THTensor_(fullConv3Dptr)(real *r_, - real alpha, - real *t_, long it, long ir, long ic, - real *k_, long kt, long kr, long kc, - long st, long sr, long sc) -{ - long or = (ir - 1) * sr + kr; - long oc = (ic - 1) * sc + kc; - - long zz, xx, yy; - - for(zz = 0; zz < it; zz++) - { - for(yy = 0; yy < ir; yy++) - { - for(xx = 0; xx < ic; xx++) - { - /* Outer product in two dimensions... (between input image and the mask) */ - real *po_ = r_ + zz*st*or*oc + yy*sr*oc + xx*sc; - real *pw_ = k_; - long kz, kx, ky; - /* printf("Output Plane : %ld,%ld,%ld, input val=%g\n",zz,yy,xx,*t_); */ - for(kz = 0; kz < kt; kz++) - { - for(ky = 0; ky < kr; ky++) - { - real z = *t_ * alpha; - for(kx = 0; kx < kc; kx++) { - /* printf("o=%g,k=%g," , po_[kx],pw_[kx]); */ - po_[kx] += z * pw_[kx]; - /* printf("o=%g " , po_[kx]); */ - } - /* printf("\n"); */ - po_ += oc; /* next input line */ - pw_ += kc; /* next mask line */ - } - po_ += (or-kr)*oc; /* next output slice */ - /* printf("\n"); */ - } - t_++; - } - } - } -} - -/* - 3D Input, 3D kernel : convolve given volume with the given kernel, full convolution. -*/ -void THTensor_(fullXCorr3Dptr)(real *r_, - real alpha, - real *t_, long it, long ir, long ic, - real *k_, long kt, long kr, long kc, - long st, long sr, long sc) -{ - long or = (ir - 1) * sr + kr; - long oc = (ic - 1) * sc + kc; - - long zz, xx, yy; - - for(zz = 0; zz < it; zz++) - { - for(yy = 0; yy < ir; yy++) - { - for(xx = 0; xx < ic; xx++) - { - /* Outer product in two dimensions... 
(between input image and the mask) */ - real *po_ = r_ + zz*st*or*oc + yy*sr*oc + xx*sc; - real *pw_ = k_ + kt*kr*kc -1; - long kz, kx, ky; - for(kz = 0; kz < kt; kz++) - { - for(ky = 0; ky < kr; ky++) - { - real z = *t_ * alpha; - for(kx = 0; kx < kc; kx++) { - po_[kx] += z * pw_[-kx]; - } - po_ += oc; /* next input line */ - pw_ -= kc; /* next mask line */ - } - po_ += (or-kr)*oc; /* next output slice */ - } - t_++; - } - } - } -} - -/* - 3D Input, 3D kernel : convolve given image with the given kernel, valid convolution. - for sr,sc=1 this is equivalent to validXCorr3Dptr, but otherwise it is useful for - calculating derivatives wrt a kernel that is applied with stride sr,sc != 1 -*/ -void THTensor_(validXCorr3DRevptr)(real *r_, - real alpha, - real *t_, long it, long ir, long ic, - real *k_, long kt, long kr, long kc, - long st, long sr, long sc) -{ - long ot = it - (kt - 1) * st; - long or = ir - (kr - 1) * sr; - long oc = ic - (kc - 1) * sc; - - long zz, xx, yy; - for(zz = 0; zz < kt; zz++) - { - for(yy = 0; yy < kr; yy++) - { - for(xx = 0; xx < kc; xx++) - { - real *po_ = r_; - real *pi_ = t_ + zz*st*ir*ic + yy*sr*ic + xx*sc; - real z = *k_++ * alpha; - long kz, kx, ky; - for(kz = 0; kz < ot; kz++) - { - for(ky = 0; ky < or; ky++) - { - for(kx = 0; kx < oc; kx++) - po_[kx] += z * pi_[kx]; - pi_ += ic; - po_ += oc; - } - pi_ += (ir-or)*ic; /* next input slice */ - } - } - } - } -} - -void THTensor_(conv2d)(real* output_data, - real alpha, - real* ptr_input, long nInputRows, long nInputCols, - real* ptr_weight, long nKernelRows, long nKernelCols, - long srow, long scol, - const char *vf, const char *xc) -{ - THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can be 'V' or 'F'"); - THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can be 'X' or 'C'"); - if (*vf == 'F') - if (*xc == 'X') - THTensor_(fullXCorr2Dptr)(output_data, - alpha, - ptr_input, nInputRows, nInputCols, - ptr_weight, nKernelRows, nKernelCols, - srow, scol); - else - 
THTensor_(fullConv2Dptr)(output_data, - alpha, - ptr_input, nInputRows, nInputCols, - ptr_weight, nKernelRows, nKernelCols, - srow, scol); - else - if (*xc == 'X') - THTensor_(validXCorr2Dptr)(output_data, - alpha, - ptr_input, nInputRows, nInputCols, - ptr_weight, nKernelRows, nKernelCols, - srow, scol); - else - THTensor_(validConv2Dptr)(output_data, - alpha, - ptr_input, nInputRows, nInputCols, - ptr_weight, nKernelRows, nKernelCols, - srow, scol); -} - -void THTensor_(conv3d)(real* output_data, - real alpha, - real* ptr_input, long nInputDepth, long nInputRows, long nInputCols, - real* ptr_weight, long nKernelDepth, long nKernelRows, long nKernelCols, - long sdepth, long srow, long scol, - const char *vf, const char *xc) -{ - THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can be 'V' or 'F'"); - THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can be 'X' or 'C'"); - if (*vf == 'F') - if (*xc == 'X') - THTensor_(fullXCorr3Dptr)(output_data, - alpha, - ptr_input, nInputDepth, nInputRows, nInputCols, - ptr_weight, nKernelDepth, nKernelRows, nKernelCols, - sdepth, srow, scol); - else - THTensor_(fullConv3Dptr)(output_data, - alpha, - ptr_input, nInputDepth, nInputRows, nInputCols, - ptr_weight, nKernelDepth, nKernelRows, nKernelCols, - sdepth, srow, scol); - else - if (*xc == 'X') - THTensor_(validXCorr3Dptr)(output_data, - alpha, - ptr_input, nInputDepth, nInputRows, nInputCols, - ptr_weight, nKernelDepth, nKernelRows, nKernelCols, - sdepth, srow, scol); - else - THTensor_(validConv3Dptr)(output_data, - alpha, - ptr_input, nInputDepth, nInputRows, nInputCols, - ptr_weight, nKernelDepth, nKernelRows, nKernelCols, - sdepth, srow, scol); -} - -long THTensor_(convsize)(long x, long k, long s, const char* vf) -{ - THArgCheck(*vf == 'V' || *vf == 'F', 1, "type of convolution can be 'V' or 'F'"); - if (*vf == 'V') - return (x-k)/s + 1; - else - return (x-1)*s + k; -} - - -/* - 3D input, 3D kernel, 4D output - like rank1 update - A <- xx' + beta*A 
- for sr,sc=1 this is equivalent to conv2Dger, but otherwise it is useful for - calculating derivatives wrt a kernel that is applied with stride sr,sc != 1 -*/ -void THTensor_(conv2DRevger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol) -{ - long nInputPlane, nInputRows, nInputCols; - long nKernelPlane, nKernelRows, nKernelCols; - long nOutputPlane, nOutputRows, nOutputCols; - long istride0, kstride0; - THTensor *input; - THTensor *kernel; - real *input_data; - real *weight_data; - real *output_data; - ptrdiff_t nelem; - long k; - - THArgCheck(t_->nDimension == 3 , 3, "input: 3D Tensor expected"); - THArgCheck(k_->nDimension == 3 , 4, "kernel: 3D Tensor expected"); - THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); - THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); - - input = THTensor_(newContiguous)(t_); - kernel = THTensor_(newContiguous)(k_); - - nInputPlane = input->size[0]; - istride0 = input->stride[0]; - nInputRows = input->size[1]; - nInputCols = input->size[2]; - - kstride0 = kernel->stride[0]; - nKernelPlane = kernel->size[0]; - nKernelRows = kernel->size[1]; - nKernelCols = kernel->size[2]; - nOutputPlane = nInputPlane * kernel->size[0]; - - THArgCheck(nInputRows >= nKernelRows && nInputCols >= nKernelCols , 2, "covn2DRevger : Input image is smaller than kernel"); - - nOutputRows = nInputRows - (nKernelRows - 1) * srow; - nOutputCols = nInputCols - (nKernelCols - 1) * scol; - - nelem = THTensor_(nElement)(r_); - THTensor_(resize4d)(r_,nKernelPlane, nInputPlane, nOutputRows, nOutputCols); - - input_data = THTensor_(data)(input); - weight_data = THTensor_(data)(kernel); - output_data = THTensor_(data)(r_); - - if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) - { - /*THTensor_(zero)(r_);*/ - -#pragma omp parallel for private(k) - for (k = 0; k < r_->size[0]*r_->size[1]; k++) - { - real* ptr_output = output_data + k*nOutputCols*nOutputRows; - long l; - for (l = 0; l < 
nOutputRows*nOutputCols; l++) - ptr_output[l] = 0.0; - } - } - else if (beta != 1) - { - /*THTensor_(mul)(r_, beta);*/ -#pragma omp parallel for private(k) - for (k = 0; k < r_->size[0]*r_->size[1]; k++) - { - real* ptr_output = output_data + k*nOutputCols*nOutputRows; - long l; - for (l = 0; l < nOutputRows*nOutputCols; l++) - ptr_output[l] *= beta; - } - } - -#pragma omp parallel for private(k) - for(k = 0; k < nKernelPlane; k++) - { - long i; - /* get kernel */ - real *ptr_weight = weight_data+k*kstride0; - - for(i = 0; i < nInputPlane; i++) - { - /* get output */ - real *ptr_output = output_data + k*nInputPlane*nOutputCols*nOutputRows + i*nOutputCols*nOutputRows; - /* get input */ - real *ptr_input = input_data+i*istride0; - - /* do image, kernel convolution */ - THTensor_(validXCorr2DRevptr)(ptr_output, - alpha, - ptr_input, nInputRows, nInputCols, - ptr_weight, nKernelRows, nKernelCols, - srow, scol); - /* Next output plane */ - /* output_data += nOutputCols*nOutputRows; */ - } - } - THTensor_(free)(input); - THTensor_(free)(kernel); -} - - -/* - 3D input, 3D kernel, 4D output - like rank1 update - A <- xx' + beta*A - for sr,sc=1 this is equivalent to conv2Dger, but otherwise it is useful for - calculating derivatives wrt a kernel that is applied with stride sr,sc != 1 -*/ -void THTensor_(conv2DRevgerm)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol) -{ - long nbatch, nInputPlane, nInputRows, nInputCols; - long nKernelPlane, nKernelRows, nKernelCols; - long nOutputRows, nOutputCols; - long istride0, kstride0, istride1, kstride1; - THTensor *input; - THTensor *kernel; - real *input_data; - real *weight_data; - real *output_data; - ptrdiff_t nelem; - long k; - - THArgCheck(t_->nDimension == 4 , 3, "input: 4D Tensor expected"); - THArgCheck(k_->nDimension == 4 , 4, "kernel: 4D Tensor expected"); - THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); - THArgCheck(scol >= 1, 6, "Stride should be a positive 
integer"); - - input = THTensor_(newContiguous)(t_); - kernel = THTensor_(newContiguous)(k_); - - istride0 = input->stride[0]; - istride1 = input->stride[1]; - nbatch = input->size[0]; - nInputPlane = input->size[1]; - nInputRows = input->size[2]; - nInputCols = input->size[3]; - - kstride0 = kernel->stride[0]; - kstride1 = kernel->stride[1]; - nKernelPlane = kernel->size[1]; - nKernelRows = kernel->size[2]; - nKernelCols = kernel->size[3]; - - THArgCheck(nInputRows >= nKernelRows && nInputCols >= nKernelCols , 2, "conv2DRevger : Input image is smaller than kernel"); - THArgCheck(kernel->size[0] == input->size[0] , 2, "conv2DRevger : Input batch and kernel batch is not same size"); - - nOutputRows = nInputRows - (nKernelRows - 1) * srow; - nOutputCols = nInputCols - (nKernelCols - 1) * scol; - - nelem = THTensor_(nElement)(r_); - THTensor_(resize4d)(r_,nKernelPlane, nInputPlane, nOutputRows, nOutputCols); - - input_data = THTensor_(data)(input); - weight_data = THTensor_(data)(kernel); - output_data = THTensor_(data)(r_); - - if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) - { - /*THTensor_(zero)(r_);*/ - -#pragma omp parallel for private(k) - for (k = 0; k < r_->size[0]*r_->size[1]; k++) - { - real* ptr_output = output_data + k*nOutputCols*nOutputRows; - long l; - for (l = 0; l < nOutputRows*nOutputCols; l++) - ptr_output[l] = 0.0; - } - } - else if (beta != 1) - { - /*THTensor_(mul)(r_, beta);*/ -#pragma omp parallel for private(k) - for (k = 0; k < r_->size[0]*r_->size[1]; k++) - { - real* ptr_output = output_data + k*nOutputCols*nOutputRows; - long l; - for (l = 0; l < nOutputRows*nOutputCols; l++) - ptr_output[l] *= beta; - } - } - -#pragma omp parallel for private(k) - for(k = 0; k < nKernelPlane; k++) - { - long i; - for(i = 0; i < nInputPlane; i++) - { - long p; - for(p = 0; p < nbatch; p++) - { - /* get kernel */ - real *ptr_weight = weight_data + p*kstride0 + k*kstride1; - /* get output */ - real *ptr_output = output_data + 
k*nInputPlane*nOutputCols*nOutputRows + i*nOutputCols*nOutputRows; - /* get input */ - real *ptr_input = input_data + p*istride0 + i*istride1; - - /* do image, kernel convolution */ - THTensor_(validXCorr2DRevptr)(ptr_output, - alpha, - ptr_input, nInputRows, nInputCols, - ptr_weight, nKernelRows, nKernelCols, - srow, scol); - /* Next output plane */ - /* output_data += nOutputCols*nOutputRows; */ - } - } - } - THTensor_(free)(input); - THTensor_(free)(kernel); -} - - -/* - 3D input, 3D kernel, 4D output - like rank1 update - A <- xx' + beta*A -*/ -void THTensor_(conv2Dger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol, const char *vf, const char *xc) -{ - long nInputPlane, nInputRows, nInputCols; - long nKernelPlane, nKernelRows, nKernelCols; - long nOutputPlane, nOutputRows, nOutputCols; - long istride0, kstride0; - - THTensor *input; - THTensor *kernel; - real *input_data; - real *weight_data; - real *output_data; - ptrdiff_t nelem; - long k; - - THArgCheck(t_->nDimension == 3 , 3, "input: 3D Tensor expected"); - THArgCheck(k_->nDimension == 3 , 4, "kernel: 3D Tensor expected"); - THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); - THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); - THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can 'V' or 'F'"); - THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can 'X' or 'C'"); - - input = THTensor_(newContiguous)(t_); - kernel = THTensor_(newContiguous)(k_); - - nInputPlane = input->size[0]; - istride0 = input->stride[0]; - nInputRows = input->size[1]; - nInputCols = input->size[2]; - - kstride0 = kernel->stride[0]; - nKernelPlane = kernel->size[0]; - nKernelRows = kernel->size[1]; - nKernelCols = kernel->size[2]; - nOutputPlane = nInputPlane * kernel->size[0]; - - THArgCheck((nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dger : Input image is smaller than kernel"); - - if (*vf == 'F') { - 
nOutputRows = (nInputRows - 1) * srow + nKernelRows; - nOutputCols = (nInputCols - 1) * scol + nKernelCols; - } else { /* valid */ - nOutputRows = (nInputRows - nKernelRows) / srow + 1; - nOutputCols = (nInputCols - nKernelCols) / scol + 1; - } - - nelem = THTensor_(nElement)(r_); - THTensor_(resize4d)(r_, nKernelPlane, nInputPlane, nOutputRows, nOutputCols); - - input_data = THTensor_(data)(input); - weight_data = THTensor_(data)(kernel); - output_data = THTensor_(data)(r_); - - if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) - { - /*THTensor_(zero)(r_);*/ -#pragma omp parallel for private(k) - for (k = 0; k < r_->size[0]*r_->size[1]; k++) - { - real* ptr_output = output_data + k*nOutputCols*nOutputRows; - long l; - for (l = 0; l < nOutputRows*nOutputCols; l++) - ptr_output[l] = 0.0; - } - } - else if (beta != 1) - { - /*THTensor_(mul)(r_, beta);*/ -#pragma omp parallel for private(k) - for (k = 0; k < r_->size[0]*r_->size[1]; k++) - { - real* ptr_output = output_data + k*nOutputCols*nOutputRows; - long l; - for (l = 0; l < nOutputRows*nOutputCols; l++) - ptr_output[l] *= beta; - } - } - -#pragma omp parallel for private(k) - for(k = 0; k < nKernelPlane; k++) - { - long i; - /* get kernel */ - real *ptr_weight = weight_data+k*kstride0; - - for(i = 0; i < nInputPlane; i++) - { - /* get output */ - real *ptr_output = output_data + k*nInputPlane*nOutputCols*nOutputRows + i*nOutputCols*nOutputRows; - /* get input */ - real *ptr_input = input_data+i*istride0; - - /* do image, kernel convolution */ - if (*vf == 'F') - if (*xc == 'X') - THTensor_(fullXCorr2Dptr)(ptr_output, - alpha, - ptr_input, nInputRows, nInputCols, - ptr_weight, nKernelRows, nKernelCols, - srow, scol); - else - THTensor_(fullConv2Dptr)(ptr_output, - alpha, - ptr_input, nInputRows, nInputCols, - ptr_weight, nKernelRows, nKernelCols, - srow, scol); - else - if (*xc == 'X') - THTensor_(validXCorr2Dptr)(ptr_output, - alpha, - ptr_input, nInputRows, nInputCols, - ptr_weight, nKernelRows, 
nKernelCols, - srow, scol); - else - THTensor_(validConv2Dptr)(ptr_output, - alpha, - ptr_input, nInputRows, nInputCols, - ptr_weight, nKernelRows, nKernelCols, - srow, scol); - /* Next output plane */ - /* output_data += nOutputCols*nOutputRows; */ - } - } - THTensor_(free)(input); - THTensor_(free)(kernel); -} - - -/* - 3D input, 4D kernel, 3D output - matrix vector product like - y <- Ax + beta*y -*/ -void THTensor_(conv2Dmv)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol, const char *vf, const char *xc) -{ - long nInputPlane, nInputRows, nInputCols; - long nKernelRows, nKernelCols; - long nOutputPlane, nOutputRows, nOutputCols; - long istride0, kstride0, kstride1; - THTensor *input; - THTensor* kernel; - real *input_data; - real *weight_data; - real *output_data; - ptrdiff_t nelem; - long k; - - THArgCheck(t_->nDimension == 3 , 3, "input: 3D Tensor expected"); - THArgCheck(k_->nDimension == 4 , 4, "kernel: 4D Tensor expected"); - THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); - THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); - THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can 'V' or 'F'"); - THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can 'X' or 'C'"); - - input = THTensor_(newContiguous)(t_); - if (!(k_->stride[3] == 1) || !(k_->stride[2] == k_->size[3])) { - kernel = THTensor_(newContiguous)(k_); - } else { - THTensor_(retain)(k_); - kernel = k_; - } - - nInputPlane = input->size[0]; - istride0 = input->stride[0]; - nInputRows = input->size[1]; - nInputCols = input->size[2]; - - kstride0 = kernel->stride[0]; - kstride1 = kernel->stride[1]; - nKernelRows = kernel->size[2]; - nKernelCols = kernel->size[3]; - nOutputPlane = kernel->size[0]; - THArgCheck(kernel->size[1] == nInputPlane, 2, "invalid number of input planes"); - - THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dmv : Input image is smaller than 
kernel"); - - if (*vf == 'F') { - nOutputRows = (nInputRows - 1) * srow + nKernelRows; - nOutputCols = (nInputCols - 1) * scol + nKernelCols; - } else { /* valid */ - nOutputRows = (nInputRows - nKernelRows) / srow + 1; - nOutputCols = (nInputCols - nKernelCols) / scol + 1; - } - - nelem = THTensor_(nElement)(r_); - THTensor_(resize3d)(r_, nOutputPlane, nOutputRows, nOutputCols); - - input_data = THTensor_(data)(input); - weight_data = THTensor_(data)(kernel); - output_data = THTensor_(data)(r_); - - if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) - { - /*THTensor_(zero)(r_);*/ -#pragma omp parallel for private(k) - for (k = 0; k < r_->size[0]; k++) - { - real* ptr_output = output_data + k*nOutputCols*nOutputRows; - long l; - for (l = 0; l < nOutputRows*nOutputCols; l++) - ptr_output[l] = 0.0; - } - } - else if (beta != 1) - { - /*THTensor_(mul)(r_, beta);*/ -#pragma omp parallel for private(k) - for (k = 0; k < r_->size[0]; k++) - { - real* ptr_output = output_data + k*nOutputCols*nOutputRows; - long l; - for (l = 0; l < nOutputRows*nOutputCols; l++) - ptr_output[l] *= beta; - } - } - -#pragma omp parallel for private(k) - for(k = 0; k < nOutputPlane; k++) - { - long i; - /* get output */ - real *ptr_output = output_data + k*nOutputCols*nOutputRows; - for(i = 0; i < nInputPlane; i++) - { - /* get kernel */ - real *ptr_weight = weight_data + k*kstride0 + i*kstride1; - /* get input */ - real *ptr_input = input_data + i*istride0; - - /* do image, kernel convolution */ - if (*vf == 'F') - if (*xc == 'X') - THTensor_(fullXCorr2Dptr)(ptr_output, - alpha, - ptr_input, nInputRows, nInputCols, - ptr_weight, nKernelRows, nKernelCols, - srow, scol); - else - THTensor_(fullConv2Dptr)(ptr_output, - alpha, - ptr_input, nInputRows, nInputCols, - ptr_weight, nKernelRows, nKernelCols, - srow, scol); - else - if (*xc == 'X') - THTensor_(validXCorr2Dptr)(ptr_output, - alpha, - ptr_input, nInputRows, nInputCols, - ptr_weight, nKernelRows, nKernelCols, - srow, scol); 
- else - THTensor_(validConv2Dptr)(ptr_output, - alpha, - ptr_input, nInputRows, nInputCols, - ptr_weight, nKernelRows, nKernelCols, - srow, scol); - } - /* Next output plane */ - /* output_data += nOutputCols*nOutputRows;*/ - } - THTensor_(free)(input); - THTensor_(free)(kernel); -} - - -/* - 3D input, 4D kernel, 3D output - matrix vector product like - y <- Ax + beta*y -*/ -void THTensor_(conv2Dmm)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol, const char *vf, const char *xc) -{ - long nInputPlane, nInputRows, nInputCols; - long nKernelRows, nKernelCols; - long nOutputPlane, nOutputRows, nOutputCols; - long kstride0, kstride1; - THTensor *input; - THTensor* kernel; - long nbatch; - ptrdiff_t nelem; - real *input_data; - real *weight_data; - real *output_data; - long p; - - THArgCheck(t_->nDimension == 4 , 3, "input: 4D Tensor expected"); - THArgCheck(k_->nDimension == 4 , 4, "kernel: 4D Tensor expected"); - THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); - THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); - THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can 'V' or 'F'"); - THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can 'X' or 'C'"); - - input = THTensor_(newContiguous)(t_); - if (!(k_->stride[3] == 1) || !(k_->stride[2] == k_->size[3])) { - kernel = THTensor_(newContiguous)(k_); - } else { - THTensor_(retain)(k_); - kernel = k_; - } - - nbatch = input->size[0]; - nInputPlane = input->size[1]; - nInputRows = input->size[2]; - nInputCols = input->size[3]; - - kstride0 = kernel->stride[0]; - kstride1 = kernel->stride[1]; - nKernelRows = kernel->size[2]; - nKernelCols = kernel->size[3]; - nOutputPlane = kernel->size[0]; - THArgCheck(kernel->size[1] == nInputPlane, 2, "invalid number of input planes"); - - THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dmv : Input image is smaller than kernel"); - - if (*vf == 'F') { - 
nOutputRows = (nInputRows - 1) * srow + nKernelRows; - nOutputCols = (nInputCols - 1) * scol + nKernelCols; - } else { /* valid */ - nOutputRows = (nInputRows - nKernelRows) / srow + 1; - nOutputCols = (nInputCols - nKernelCols) / scol + 1; - } - - nelem = THTensor_(nElement)(r_); - THTensor_(resize4d)(r_, nbatch, nOutputPlane, nOutputRows, nOutputCols); - - input_data = THTensor_(data)(input); - weight_data = THTensor_(data)(kernel); - output_data = THTensor_(data)(r_); - - if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) - { - /*THTensor_(zero)(r_);*/ -#pragma omp parallel for private(p) - for (p=0; p < r_->size[0]; p++) - { - long k; - for (k = 0; k < r_->size[1]; k++) - { - real* ptr_output = output_data + p*nOutputPlane*nOutputRows*nOutputCols + k*nOutputCols*nOutputRows; - long l; - for (l = 0; l < nOutputRows*nOutputCols; l++) - ptr_output[l] = 0.0; - } - } - } - else if (beta != 1) - { - /*THTensor_(mul)(r_, beta);*/ -#pragma omp parallel for private(p) - for(p=0; p < r_->size[0]; p++) - { - long k; - for (k = 0; k < r_->size[1]; k++) - { - real* ptr_output = output_data + p*nOutputPlane*nOutputRows*nOutputCols + k*nOutputCols*nOutputRows; - long l; - for (l = 0; l < nOutputRows*nOutputCols; l++) - ptr_output[l] *= beta; - } - } - } - -#pragma omp parallel for private(p) - for(p=0; p < nbatch; p++) - { - long k; - for(k = 0; k < nOutputPlane; k++) - { - long i; - /* get output */ - real *ptr_output = output_data + p*nOutputPlane*nOutputCols*nOutputRows + k*nOutputCols*nOutputRows; - for(i = 0; i < nInputPlane; i++) - { - /* get kernel */ - real *ptr_weight = weight_data + k*kstride0 + i*kstride1; - /* get input */ - real *ptr_input = input_data + p*nInputPlane*nInputRows*nInputCols + i*nInputRows*nInputCols; - - /* do image, kernel convolution */ - if (*vf == 'F') - if (*xc == 'X') - THTensor_(fullXCorr2Dptr)(ptr_output, - alpha, - ptr_input, nInputRows, nInputCols, - ptr_weight, nKernelRows, nKernelCols, - srow, scol); - else - 
THTensor_(fullConv2Dptr)(ptr_output, - alpha, - ptr_input, nInputRows, nInputCols, - ptr_weight, nKernelRows, nKernelCols, - srow, scol); - else - if (*xc == 'X') - THTensor_(validXCorr2Dptr)(ptr_output, - alpha, - ptr_input, nInputRows, nInputCols, - ptr_weight, nKernelRows, nKernelCols, - srow, scol); - else - THTensor_(validConv2Dptr)(ptr_output, - alpha, - ptr_input, nInputRows, nInputCols, - ptr_weight, nKernelRows, nKernelCols, - srow, scol); - } - /* Next output plane */ - /* output_data += nOutputCols*nOutputRows;*/ - } - } - THTensor_(free)(input); - THTensor_(free)(kernel); -} - - -/* - 2D input, 2D kernel, 2D output - scalar multiplication like - y <- x*y + beta*y -*/ -void THTensor_(conv2Dmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol, const char *vf, const char *xc) -{ - THTensor *input; - THTensor* kernel; - long nInputRows; - long nInputCols; - long nKernelRows; - long nKernelCols; - long nOutputRows, nOutputCols; - real *ptr_input; - real *ptr_weight; - real *output_data; - ptrdiff_t nelem; - - THArgCheck(t_->nDimension == 2 , 3, "input: 2D Tensor expected"); - THArgCheck(k_->nDimension == 2 , 4, "kernel: 2D Tensor expected"); - THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); - THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); - - input = THTensor_(newContiguous)(t_); - kernel = THTensor_(newContiguous)(k_); - - nInputRows = input->size[0]; - nInputCols = input->size[1]; - nKernelRows = kernel->size[0]; - nKernelCols = kernel->size[1]; - - THArgCheck((nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dmul : Input image is smaller than kernel"); - - nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf); - nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf); - - nelem = THTensor_(nElement)(r_); - THTensor_(resize2d)(r_, nOutputRows, nOutputCols); - if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) - 
THTensor_(zero)(r_); - else if (beta != 1) - THTensor_(mul)(r_, r_, beta); - - ptr_input = THTensor_(data)(input); - ptr_weight = THTensor_(data)(kernel); - output_data = THTensor_(data)(r_); - - - /* do image, kernel convolution */ - THTensor_(conv2d)(output_data, - alpha, - ptr_input, nInputRows, nInputCols, - ptr_weight, nKernelRows, nKernelCols, - srow, scol, vf, xc); - THTensor_(free)(input); - THTensor_(free)(kernel); -} - -/* - 3D input, 3D kernel, 3D output - component wise multiplication like - y <- y.*x + beta*y -*/ -void THTensor_(conv2Dcmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol, const char *vf, const char *xc) -{ - long nInputPlane, nInputRows, nInputCols; - long nKernelRows, nKernelCols; - long nOutputPlane, nOutputRows, nOutputCols; - long istride0, kstride0; - THTensor *input; - THTensor *kernel; - real *input_data; - real *weight_data; - real *output_data; - ptrdiff_t nelem; - long k; - - THArgCheck(t_->nDimension == 3 , 3, "input: 3D Tensor expected"); - THArgCheck(k_->nDimension == 3 , 4, "kernel: 3D Tensor expected"); - THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); - THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); - - input = THTensor_(newContiguous)(t_); - kernel = THTensor_(newContiguous)(k_); - - istride0 = input->stride[0]; - nInputPlane = input->size[0]; - nInputRows = input->size[1]; - nInputCols = input->size[2]; - - kstride0 = kernel->stride[0]; - nOutputPlane = kernel->size[0]; - nKernelRows = kernel->size[1]; - nKernelCols = kernel->size[2]; - - THArgCheck(nOutputPlane == nInputPlane, 2, "invalid number of input/kernel planes"); - THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dcmul : Input image is smaller than kernel"); - - nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf); - nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf); - - nelem = THTensor_(nElement)(r_); - 
THTensor_(resize3d)(r_, nOutputPlane, nOutputRows, nOutputCols); - - if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) - { - THTensor_(zero)(r_); - } - else if (beta != 1) - THTensor_(mul)(r_, r_, beta); - - input_data = THTensor_(data)(input); - weight_data = THTensor_(data)(kernel); - output_data = THTensor_(data)(r_); - - for(k = 0; k < nOutputPlane; k++) - { - /* get kernel */ - real *ptr_weight = weight_data + k*kstride0; - /* get input */ - real *ptr_input = input_data + k*istride0; - - /* do image, kernel convolution */ - THTensor_(conv2d)(output_data, - alpha, - ptr_input, nInputRows, nInputCols, - ptr_weight, nKernelRows, nKernelCols, - srow, scol, vf, xc); - /* Next output plane */ - output_data += nOutputCols*nOutputRows; - } - THTensor_(free)(input); - THTensor_(free)(kernel); -} - -/* - 3D input, 3D kernel, 3D output - component wise multiplication like with a permutation map - y <- y.*x + beta*y -*/ -void THTensor_(conv2Dmap)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, THTensor *map, long srow, long scol, const char *vf, const char *xc) -{ - long nInputPlane, nInputRows, nInputCols; - long nKernelRows, nKernelCols; - long nOutputPlane, nOutputRows, nOutputCols; - long istride0, kstride0; - THTensor *input; - THTensor* kernel; - real *input_data; - real *weight_data; - real *output_data; - long nmaps; - ptrdiff_t nelem; - long k; - - THArgCheck(t_->nDimension == 3 , 3, "input: 3D Tensor expected"); - THArgCheck(k_->nDimension == 3 , 4, "kernel: 3D Tensor expected"); - THArgCheck(map->nDimension == 2 , 4, "map: 2D Tensor expected"); - THArgCheck(srow >= 1, 6, "Stride should be a positive integer"); - THArgCheck(scol >= 1, 7, "Stride should be a positive integer"); - - input = THTensor_(newContiguous)(t_); - kernel = THTensor_(newContiguous)(k_); - - istride0 = input->stride[0]; - nInputPlane = input->size[0]; - nInputRows = input->size[1]; - nInputCols = input->size[2]; - - kstride0 = kernel->stride[0]; - nOutputPlane 
= kernel->size[0]; - nKernelRows = kernel->size[1]; - nKernelCols = kernel->size[2]; - - THArgCheck(nOutputPlane == nInputPlane, 2, "invalid number of input/kernel planes"); - THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols) - || *vf == 'F', 2, "conv2Dmap : Input image is smaller than kernel"); - - nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf); - nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf); - - nelem = THTensor_(nElement)(r_); - THTensor_(resize3d)(r_, nOutputPlane, nOutputRows, nOutputCols); - - if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) - { - THTensor_(zero)(r_); - } - else if (beta != 1) - THTensor_(mul)(r_, r_, beta); - - input_data = THTensor_(data)(input); - weight_data = THTensor_(data)(kernel); - output_data = THTensor_(data)(r_); - - nmaps = map->size[0]; - - for(k = 0; k < nmaps; k++) - { - /* get indices */ - long from = (long)THTensor_(get2d)(map,k,0)-1; - long to = (long)THTensor_(get2d)(map,k,1)-1; - - /* get kernel */ - real *ptr_weight = weight_data + k*kstride0; - /* get input */ - real *ptr_input = input_data + from*istride0; - /* get output */ - real *ptr_output = output_data + to*nOutputRows*nOutputCols; - - /* do image, kernel convolution */ - THTensor_(conv2d)(ptr_output, - alpha, - ptr_input, nInputRows, nInputCols, - ptr_weight, nKernelRows, nKernelCols, - srow, scol, vf, xc); - } - THTensor_(free)(input); - THTensor_(free)(kernel); -} - -/* - 4D input, 4D kernel, 5D output - like rank1 update - A <- xx' + beta*A - for sr,sc=1 this is equivalent to xcorr2Dger, but otherwise it is useful for - calculating derivatives wrt a kernel that is applied with stride sr,sc != 1 -*/ -void THTensor_(conv3DRevger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, - long sdepth, long srow, long scol) -{ - long nInputPlane, nInputDepth, nInputRows, nInputCols; - long nKernelPlane, nKernelDepth, nKernelRows, nKernelCols; - long nOutputPlane, nOutputDepth, 
nOutputRows, nOutputCols; - long istride0, kstride0; - THTensor *input; - THTensor *kernel; - real *input_data; - real *weight_data; - real *output_data; - ptrdiff_t nelem; - long k, i; - - THArgCheck(t_->nDimension == 4 , 3, "input: 4D Tensor expected"); - THArgCheck(k_->nDimension == 4 , 4, "kernel: 4D Tensor expected"); - THArgCheck(sdepth >= 1, 5, "Stride should be a positive integer"); - THArgCheck(srow >= 1, 6, "Stride should be a positive integer"); - THArgCheck(scol >= 1, 7, "Stride should be a positive integer"); - - input = THTensor_(newContiguous)(t_); - kernel = THTensor_(newContiguous)(k_); - - nInputPlane = input->size[0]; - istride0 = input->stride[0]; - nInputDepth = input->size[1]; - nInputRows = input->size[2]; - nInputCols = input->size[3]; - - kstride0 = kernel->stride[0]; - nKernelPlane = kernel->size[0]; - nKernelDepth= kernel->size[1]; - nKernelRows = kernel->size[2]; - nKernelCols = kernel->size[3]; - nOutputPlane = nInputPlane * kernel->size[0]; - - THArgCheck(nInputDepth >= nKernelDepth && nInputRows >= nKernelRows && nInputCols >= nKernelCols , 2, "conv3DRevger : Input image is smaller than kernel"); - - nOutputDepth = nInputDepth - (nKernelDepth - 1) * sdepth; - nOutputRows = nInputRows - (nKernelRows - 1) * srow; - nOutputCols = nInputCols - (nKernelCols - 1) * scol; - - nelem = THTensor_(nElement)(r_); - THTensor_(resize5d)(r_,nKernelPlane, nInputPlane, nOutputDepth, nOutputRows, nOutputCols); - - if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) - { - THTensor_(zero)(r_); - } - else if (beta != 1) - THTensor_(mul)(r_, r_, beta); - - input_data = THTensor_(data)(input); - weight_data = THTensor_(data)(kernel); - output_data = THTensor_(data)(r_); - - for(k = 0; k < nKernelPlane; k++) - { - /* get kernel */ - real *ptr_weight = weight_data+k*kstride0; - - for(i = 0; i < nInputPlane; i++) - { - /* get input */ - real *ptr_input = input_data+i*istride0; - - /* do image, kernel convolution */ - 
THTensor_(validXCorr3DRevptr)(output_data, - alpha, - ptr_input, nInputDepth, nInputRows, nInputCols, - ptr_weight, nKernelDepth, nKernelRows, nKernelCols, - sdepth, srow, scol); - /* Next output plane */ - output_data += nOutputDepth*nOutputCols*nOutputRows; - } - } - THTensor_(free)(input); - THTensor_(free)(kernel); -} - - -/* - 4D input, 4D kernel, 5D output - like rank1 update - A <- xx' + beta*A -*/ -void THTensor_(conv3Dger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, - long sdepth, long srow, long scol, const char *vf, const char *xc) -{ - long nInputPlane, nInputDepth, nInputRows, nInputCols; - long nKernelPlane, nKernelDepth, nKernelRows, nKernelCols; - long nOutputPlane, nOutputDepth, nOutputRows, nOutputCols; - long istride0, kstride0; - THTensor *input; - THTensor *kernel; - real *input_data; - real *weight_data; - real *output_data; - ptrdiff_t nelem; - long k, i; - - THArgCheck(t_->nDimension == 4 , 3, "input: 4D Tensor expected"); - THArgCheck(k_->nDimension == 4 , 4, "kernel: 4D Tensor expected"); - THArgCheck(sdepth >= 1, 5, "Stride should be a positive integer"); - THArgCheck(srow >= 1, 6, "Stride should be a positive integer"); - THArgCheck(scol >= 1, 7, "Stride should be a positive integer"); - THArgCheck(*vf == 'V' || *vf == 'F', 8, "type of convolution can 'V' or 'F'"); - THArgCheck(*xc == 'C' || *xc == 'X', 8, "type of convolution can 'X' or 'C'"); - - input = THTensor_(newContiguous)(t_); - kernel = THTensor_(newContiguous)(k_); - - nInputPlane = input->size[0]; - istride0 = input->stride[0]; - nInputDepth = input->size[1]; - nInputRows = input->size[2]; - nInputCols = input->size[3]; - - kstride0 = kernel->stride[0]; - nKernelPlane = kernel->size[0]; - nKernelDepth = kernel->size[1]; - nKernelRows = kernel->size[2]; - nKernelCols = kernel->size[3]; - nOutputPlane = nInputPlane * kernel->size[0]; - - THArgCheck((nInputDepth >= nKernelDepth - && nInputRows >= nKernelRows - && nInputCols >= nKernelCols) - || *vf == 'F', 
2, "conv3Dger : Input image is smaller than kernel"); - - nOutputDepth = THTensor_(convsize)(nInputDepth, nKernelDepth, sdepth, vf); - nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf); - nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf); - - nelem = THTensor_(nElement)(r_); - THTensor_(resize5d)(r_,nKernelPlane, nInputPlane, nOutputDepth, nOutputRows, nOutputCols); - - if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) - { - THTensor_(zero)(r_); - } - else if (beta != 1) - THTensor_(mul)(r_, r_, beta); - - input_data = THTensor_(data)(input); - weight_data = THTensor_(data)(kernel); - output_data = THTensor_(data)(r_); - - for(k = 0; k < nKernelPlane; k++) - { - /* get kernel */ - real *ptr_weight = weight_data+k*kstride0; - - for(i = 0; i < nInputPlane; i++) - { - /* get input */ - real *ptr_input = input_data+i*istride0; - - /* do image, kernel convolution */ - THTensor_(conv3d)(output_data, - alpha, - ptr_input, nInputDepth, nInputRows, nInputCols, - ptr_weight, nKernelDepth, nKernelRows, nKernelCols, - sdepth, srow, scol, vf, xc); - - /* Next output plane */ - output_data += nOutputDepth*nOutputCols*nOutputRows; - } - } - THTensor_(free)(input); - THTensor_(free)(kernel); -} - -/* - 4D input, 5D kernel, 4D output - matrix vector product like - y <- Ax + beta*y -*/ -void THTensor_(conv3Dmv)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, - long sdepth, long srow, long scol, const char *vf, const char *xc) -{ - long nInputPlane, nInputDepth, nInputRows, nInputCols; - long nKernelDepth, nKernelRows, nKernelCols; - long nOutputPlane, nOutputDepth, nOutputRows, nOutputCols; - long istride0, kstride0, kstride1; - THTensor *input; - THTensor *kernel; - real *input_data; - real *weight_data; - real *output_data; - ptrdiff_t nelem; - long k, i; - - THArgCheck(t_->nDimension == 4 , 3, "input: 4D Tensor expected"); - THArgCheck(k_->nDimension == 5 , 4, "kernel: 5D Tensor expected"); - THArgCheck(sdepth 
>= 1, 5, "Stride should be a positive integer"); - THArgCheck(srow >= 1, 6, "Stride should be a positive integer"); - THArgCheck(scol >= 1, 7, "Stride should be a positive integer"); - THArgCheck(*vf == 'V' || *vf == 'F', 8, "type of convolution can 'V' or 'F'"); - THArgCheck(*xc == 'C' || *xc == 'X', 8, "type of convolution can 'X' or 'C'"); - - input = THTensor_(newContiguous)(t_); - if (!(k_->stride[4] == 1) || !(k_->stride[3] == k_->size[4])) { - kernel = THTensor_(newContiguous)(k_); - } else { - THTensor_(retain)(k_); - kernel = k_; - } - - nInputPlane = input->size[0]; - istride0 = input->stride[0]; - nInputDepth = input->size[1]; - nInputRows = input->size[2]; - nInputCols = input->size[3]; - - kstride0 = kernel->stride[0]; - kstride1 = kernel->stride[1]; - nKernelDepth = kernel->size[2]; - nKernelRows = kernel->size[3]; - nKernelCols = kernel->size[4]; - nOutputPlane = kernel->size[0]; - THArgCheck(kernel->size[1] == nInputPlane, 2, "invalid number of input planes"); - - THArgCheck( (nInputDepth >= nKernelDepth && nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv3Dmv : Input image is smaller than kernel"); - - nOutputDepth = THTensor_(convsize)(nInputDepth, nKernelDepth, sdepth, vf); - nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf); - nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf); - - nelem = THTensor_(nElement)(r_); - THTensor_(resize4d)(r_, nOutputPlane, nOutputDepth, nOutputRows, nOutputCols); - - if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) - { - THTensor_(zero)(r_); - } - else if (beta != 1) - THTensor_(mul)(r_, r_, beta); - - input_data = THTensor_(data)(input); - weight_data = THTensor_(data)(kernel); - output_data = THTensor_(data)(r_); - - for(k = 0; k < nOutputPlane; k++) - { - for(i = 0; i < nInputPlane; i++) - { - /* get kernel */ - real *ptr_weight = weight_data + k*kstride0 + i*kstride1; - /* get input */ - real *ptr_input = input_data + i*istride0; 
- - /* do image, kernel convolution */ - THTensor_(conv3d)(output_data, - alpha, - ptr_input, nInputDepth, nInputRows, nInputCols, - ptr_weight, nKernelDepth, nKernelRows, nKernelCols, - sdepth, srow, scol, vf, xc); - } - /* Next output plane */ - output_data += nOutputDepth*nOutputCols*nOutputRows; - } - THTensor_(free)(input); - THTensor_(free)(kernel); -} - -/* - 3D input, 3D kernel, 3D output - scalar multiplication like - y <- x*y + beta*y -*/ -void THTensor_(conv3Dmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, - long sdepth, long srow, long scol, const char *vf, const char *xc) -{ - THTensor *input; - THTensor* kernel; - long nInputDepth; - long nInputRows; - long nInputCols; - long nKernelDepth; - long nKernelRows; - long nKernelCols; - long nOutputDepth, nOutputRows, nOutputCols; - real *ptr_input; - real *ptr_weight; - real *output_data; - ptrdiff_t nelem; - - THArgCheck(t_->nDimension == 3 , 3, "input: 3D Tensor expected"); - THArgCheck(k_->nDimension == 3 , 4, "kernel: 3D Tensor expected"); - THArgCheck(sdepth >= 1, 5, "Stride should be a positive integer"); - THArgCheck(srow >= 1, 6, "Stride should be a positive integer"); - THArgCheck(scol >= 1, 7, "Stride should be a positive integer"); - THArgCheck(*vf == 'V' || *vf == 'F', 8, "type of convolution can 'V' or 'F'"); - THArgCheck(*xc == 'C' || *xc == 'X', 8, "type of convolution can 'X' or 'C'"); - - input = THTensor_(newContiguous)(t_); - kernel = THTensor_(newContiguous)(k_); - - nInputDepth = input->size[0]; - nInputRows = input->size[1]; - nInputCols = input->size[2]; - nKernelDepth = kernel->size[0]; - nKernelRows = kernel->size[1]; - nKernelCols = kernel->size[2]; - - THArgCheck((nInputDepth >= nKernelDepth && nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv3Dmul : Input image is smaller than kernel"); - - nOutputDepth = THTensor_(convsize)(nInputDepth, nKernelDepth, sdepth, vf); - nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, 
srow, vf); - nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf); - - nelem = THTensor_(nElement)(r_); - THTensor_(resize3d)(r_, nOutputDepth, nOutputRows, nOutputCols); - if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) - THTensor_(zero)(r_); - else if (beta != 1) - THTensor_(mul)(r_, r_, beta); - - ptr_input = THTensor_(data)(input); - ptr_weight = THTensor_(data)(kernel); - output_data = THTensor_(data)(r_); - - - /* do image, kernel convolution */ - THTensor_(conv3d)(output_data, - alpha, - ptr_input, nInputDepth, nInputRows, nInputCols, - ptr_weight, nKernelDepth, nKernelRows, nKernelCols, - sdepth, srow, scol, vf, xc); - THTensor_(free)(input); - THTensor_(free)(kernel); -} - -/* - 4D input, 4D kernel, 4D output - component wise multiplication like - y <- y.*x + beta*y -*/ -void THTensor_(conv3Dcmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, - long sdepth, long srow, long scol, const char *vf, const char *xc) -{ - long nInputPlane, nInputDepth, nInputRows, nInputCols; - long nKernelDepth, nKernelRows, nKernelCols; - long nOutputPlane, nOutputDepth, nOutputRows, nOutputCols; - long istride0, kstride0; - - THTensor *input; - THTensor *kernel; - real *input_data; - real *weight_data; - real *output_data; - ptrdiff_t nelem; - long k; - - THArgCheck(t_->nDimension == 4 , 3, "input: 3D Tensor expected"); - THArgCheck(k_->nDimension == 4 , 4, "kernel: 3D Tensor expected"); - THArgCheck(srow >= 1, 5, "Stride should be a positive integer"); - THArgCheck(scol >= 1, 6, "Stride should be a positive integer"); - THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can 'V' or 'F'"); - THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can 'X' or 'C'"); - - input = THTensor_(newContiguous)(t_); - kernel = THTensor_(newContiguous)(k_); - - istride0 = input->stride[0]; - nInputPlane = input->size[0]; - nInputDepth = input->size[1]; - nInputRows = input->size[2]; - nInputCols = input->size[3]; - - 
kstride0 = kernel->stride[0]; - nOutputPlane = kernel->size[0]; - nKernelDepth = kernel->size[1]; - nKernelRows = kernel->size[2]; - nKernelCols = kernel->size[3]; - - THArgCheck(nOutputPlane == nInputPlane, 2, "invalid number of input/kernel planes"); - THArgCheck( (nInputDepth >= nKernelDepth && nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv3Dcmul : Input image is smaller than kernel"); - - nOutputDepth = THTensor_(convsize)(nInputDepth, nKernelDepth, sdepth, vf); - nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf); - nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf); - - nelem = THTensor_(nElement)(r_); - THTensor_(resize4d)(r_, nOutputPlane, nOutputDepth, nOutputRows, nOutputCols); - - if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) - { - THTensor_(zero)(r_); - } - else if (beta != 1) - THTensor_(mul)(r_, r_, beta); - - input_data = THTensor_(data)(input); - weight_data = THTensor_(data)(kernel); - output_data = THTensor_(data)(r_); - - for(k = 0; k < nOutputPlane; k++) - { - /* get kernel */ - real *ptr_weight = weight_data + k*kstride0; - /* get input */ - real *ptr_input = input_data + k*istride0; - - /* do image, kernel convolution */ - THTensor_(conv3d)(output_data, - alpha, - ptr_input, nInputDepth, nInputRows, nInputCols, - ptr_weight, nKernelDepth, nKernelRows, nKernelCols, - sdepth, srow, scol, vf, xc); - - /* Next output plane */ - output_data += nOutputDepth*nOutputCols*nOutputRows; - } - THTensor_(free)(input); - THTensor_(free)(kernel); -} - -/* - 4D input, 4D kernel, 4D output - component wise multiplication like with a permutation map - y <- y.*x + beta*y -*/ -void THTensor_(conv3Dmap)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, THTensor *map, - long sdepth, long srow, long scol, const char *vf, const char *xc) -{ - long nInputPlane, nInputDepth, nInputRows, nInputCols; - long nKernelDepth, nKernelRows, nKernelCols; - long nOutputPlane, 
nOutputDepth, nOutputRows, nOutputCols; - long istride0, kstride0; - - THTensor *input; - THTensor *kernel; - ptrdiff_t nelem; - real *input_data; - real *weight_data; - real *output_data; - long nmaps; - long k; - - THArgCheck(t_->nDimension == 4 , 3, "input: 4D Tensor expected"); - THArgCheck(k_->nDimension == 4 , 4, "kernel: 4D Tensor expected"); - THArgCheck(map->nDimension == 2 , 4, "map: 2D Tensor expected"); - THArgCheck(srow >= 1, 6, "Stride should be a positive integer"); - THArgCheck(scol >= 1, 7, "Stride should be a positive integer"); - THArgCheck(*vf == 'V' || *vf == 'F', 8, "type of convolution can 'V' or 'F'"); - THArgCheck(*xc == 'C' || *xc == 'X', 8, "type of convolution can 'X' or 'C'"); - - input = THTensor_(newContiguous)(t_); - kernel = THTensor_(newContiguous)(k_); - - istride0 = input->stride[0]; - nInputPlane = input->size[0]; - nInputDepth = input->size[1]; - nInputRows = input->size[2]; - nInputCols = input->size[3]; - - kstride0 = kernel->stride[0]; - nOutputPlane = kernel->size[0]; - nKernelDepth = kernel->size[1]; - nKernelRows = kernel->size[2]; - nKernelCols = kernel->size[3]; - - THArgCheck(nOutputPlane == nInputPlane, 2, "invalid number of input/kernel planes"); - THArgCheck((nInputDepth >= nKernelDepth - && nInputRows >= nKernelRows - && nInputCols >= nKernelCols) || *vf == 'F', - 2, "conv3Dmap : Input image is smaller than kernel"); - - nOutputDepth = THTensor_(convsize)(nInputDepth, nKernelDepth, sdepth, vf); - nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf); - nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf); - - nelem = THTensor_(nElement)(r_); - THTensor_(resize4d)(r_, nOutputPlane, nOutputDepth, nOutputRows, nOutputCols); - - if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_)) - { - THTensor_(zero)(r_); - } - else if (beta != 1) - THTensor_(mul)(r_, r_, beta); - - input_data = THTensor_(data)(input); - weight_data = THTensor_(data)(kernel); - output_data = 
THTensor_(data)(r_); - - nmaps = map->size[0]; - - for(k = 0; k < nmaps; k++) - { - /* get indices */ - long from = (long)THTensor_(get2d)(map,k,0)-1; - long to = (long)THTensor_(get2d)(map,k,1)-1; - - /* get kernel */ - real *ptr_weight = weight_data + k*kstride0; - /* get input */ - real *ptr_input = input_data + from*istride0; - /* get output */ - real *ptr_output = output_data + to*nOutputDepth*nOutputRows*nOutputCols; - - /* do image, kernel convolution */ - THTensor_(conv3d)(ptr_output, - alpha, - ptr_input, nInputDepth, nInputRows, nInputCols, - ptr_weight, nKernelDepth, nKernelRows, nKernelCols, - sdepth, srow, scol, vf, xc); - } - THTensor_(free)(input); - THTensor_(free)(kernel); -} -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/generic/THTensorConv.h b/contrib/lua-torch/torch7/lib/TH/generic/THTensorConv.h deleted file mode 100644 index 79866f390..000000000 --- a/contrib/lua-torch/torch7/lib/TH/generic/THTensorConv.h +++ /dev/null @@ -1,79 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "generic/THTensorConv.h" -#else - -TH_API void THTensor_(validXCorr2Dptr)(real *r_, - real alpha, - real *t_, long ir, long ic, - real *k_, long kr, long kc, - long sr, long sc); - -TH_API void THTensor_(validConv2Dptr)(real *r_, - real alpha, - real *t_, long ir, long ic, - real *k_, long kr, long kc, - long sr, long sc); - -TH_API void THTensor_(fullXCorr2Dptr)(real *r_, - real alpha, - real *t_, long ir, long ic, - real *k_, long kr, long kc, - long sr, long sc); - -TH_API void THTensor_(fullConv2Dptr)(real *r_, - real alpha, - real *t_, long ir, long ic, - real *k_, long kr, long kc, - long sr, long sc); - -TH_API void THTensor_(validXCorr2DRevptr)(real *r_, - real alpha, - real *t_, long ir, long ic, - real *k_, long kr, long kc, - long sr, long sc); - -TH_API void THTensor_(conv2DRevger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol); -TH_API void THTensor_(conv2DRevgerm)(THTensor *r_, real beta, real alpha, 
THTensor *t_, THTensor *k_, long srow, long scol); -TH_API void THTensor_(conv2Dger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol, const char *vf, const char *xc); -TH_API void THTensor_(conv2Dmv)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol, const char *vf, const char *xc); -TH_API void THTensor_(conv2Dmm)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol, const char *vf, const char *xc); -TH_API void THTensor_(conv2Dmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol, const char *vf, const char *xc); -TH_API void THTensor_(conv2Dcmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol, const char *vf, const char *xc); - -TH_API void THTensor_(validXCorr3Dptr)(real *r_, - real alpha, - real *t_, long it, long ir, long ic, - real *k_, long kt, long kr, long kc, - long st, long sr, long sc); - -TH_API void THTensor_(validConv3Dptr)(real *r_, - real alpha, - real *t_, long it, long ir, long ic, - real *k_, long kt, long kr, long kc, - long st, long sr, long sc); - -TH_API void THTensor_(fullXCorr3Dptr)(real *r_, - real alpha, - real *t_, long it, long ir, long ic, - real *k_, long kt, long kr, long kc, - long st, long sr, long sc); - -TH_API void THTensor_(fullConv3Dptr)(real *r_, - real alpha, - real *t_, long it, long ir, long ic, - real *k_, long kt, long kr, long kc, - long st, long sr, long sc); - -TH_API void THTensor_(validXCorr3DRevptr)(real *r_, - real alpha, - real *t_, long it, long ir, long ic, - real *k_, long kt, long kr, long kc, - long st, long sr, long sc); - -TH_API void THTensor_(conv3DRevger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long sdepth, long srow, long scol); -TH_API void THTensor_(conv3Dger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long sdepth, long srow, long scol, const char *vf, const char *xc); 
-TH_API void THTensor_(conv3Dmv)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long sdepth, long srow, long scol, const char *vf, const char *xc); -TH_API void THTensor_(conv3Dmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long sdepth, long srow, long scol, const char *vf, const char *xc); -TH_API void THTensor_(conv3Dcmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long sdepth, long srow, long scol, const char *vf, const char *xc); - -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/generic/THTensorCopy.c b/contrib/lua-torch/torch7/lib/TH/generic/THTensorCopy.c deleted file mode 100644 index d9cd1c0d5..000000000 --- a/contrib/lua-torch/torch7/lib/TH/generic/THTensorCopy.c +++ /dev/null @@ -1,136 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "generic/THTensorCopy.c" -#else - -int THTensor_(copyTransposeValid)(THTensor *tensor, THTensor *src) { - const int MIN_SZ = 60 * 60; - return THTensor_(isContiguous)(tensor) && - THTensor_(nDimension)(src) == 2 && - THTensor_(stride)(src, 0) == 1 && - THTensor_(stride)(src, 1) == THTensor_(size)(src, 0) && - THTensor_(nElement)(tensor) >= MIN_SZ; -} - -// special case copy where tensor is contiguous and src is a transposed matrix -// This can be generalized to most copies, but it's tricker -void THTensor_(copyTranspose)(THTensor *tensor, THTensor *src) { - #define MIN(x, y) (((x) < (y)) ? (x) : (y)) - #define MAX(x, y) (((x) > (y)) ? 
(x) : (y)) - -#ifdef TH_REAL_IS_BYTE - const int BLOCK_SZ = 120; -#else - const int BLOCK_SZ = 60; -#endif - - THTensor *buf = THTensor_(newWithSize2d)(BLOCK_SZ, BLOCK_SZ); - real *sp = THTensor_(data)(src); - real *rp = THTensor_(data)(tensor); - real *bp = THTensor_(data)(buf); - - long NR = THTensor_(size)(src, 0); - long NC = THTensor_(size)(src, 1); - for (long R = 0; R < NR; R += BLOCK_SZ) { - for (long C = 0; C < NC; C += BLOCK_SZ) { - real *spo = sp + R + C * NR; - real *rpo = rp + C + R * NC; - - int nr = MIN(NR - R, BLOCK_SZ); - int nc = MIN(NC - C, BLOCK_SZ); - - // 1. copy columns from src to buf - for (int c = 0; c < nc; c++) { - memcpy(bp + c * BLOCK_SZ, spo + c * NR, nr * sizeof(real)); - } - - // 2. transpose buf in place - int rc_max = MAX(nr, nc); - int rc_min = MIN(nr, nc); - for (int r = 0; r < rc_max; r++) { - int end = MIN(r, rc_min); - for (int c = 0; c < end; c++) { - real tmp = bp[r + BLOCK_SZ * c]; - bp[r + BLOCK_SZ * c] = bp[r * BLOCK_SZ + c]; - bp[r * BLOCK_SZ + c] = tmp; - } - } - - // 3. 
copy rows from buf to dst - for (int r = 0; r < nr; r++) { - memcpy(rpo + r * NC, bp + r * BLOCK_SZ, nc * sizeof(real)); - } - } - } - THTensor_(free)(buf); - #undef MIN - #undef MAX -} - -void THTensor_(copy)(THTensor *tensor, THTensor *src) -{ - if (tensor == src) return; - if (THTensor_(isContiguous)(tensor) && THTensor_(isContiguous)(src) && THTensor_(nElement)(tensor) == THTensor_(nElement)(src)) { - real *sp = THTensor_(data)(src); - real *rp = THTensor_(data)(tensor); - ptrdiff_t sz = THTensor_(nElement)(tensor); -#ifndef TH_REAL_IS_HALF - THVector_(copy)(rp, sp, sz); -#else - memcpy(rp, sp, sz * sizeof(real)); -#endif -#ifndef TH_REAL_IS_HALF - } else if (THTensor_(copyTransposeValid)(tensor, src)) { - THTensor_(copyTranspose)(tensor, src); -#endif - } else { - TH_TENSOR_APPLY2(real, tensor, real, src, *tensor_data = *src_data;) - } -} - -#define IMPLEMENT_THTensor_COPY(TYPENAMESRC, TYPE_SRC) \ -void THTensor_(copy##TYPENAMESRC)(THTensor *tensor, TH##TYPENAMESRC##Tensor *src) \ -{ \ - TH_TENSOR_APPLY2(real, tensor, TYPE_SRC, src, *tensor_data = (real)(*src_data);) \ -} - -#define IMPLEMENT_THTensor_COPY_TO_HALF(TYPENAMESRC, TYPE_SRC) \ -void THTensor_(copy##TYPENAMESRC)(THTensor *tensor, TH##TYPENAMESRC##Tensor *src) \ -{ \ - TH_TENSOR_APPLY2(real, tensor, TYPE_SRC, src, *tensor_data = TH_float2half((float)*src_data);) \ -} - -#define IMPLEMENT_THTensor_COPY_FROM_HALF(TYPENAMESRC, TYPE_SRC) \ -void THTensor_(copy##TYPENAMESRC)(THTensor *tensor, TH##TYPENAMESRC##Tensor *src) \ -{ \ - TH_TENSOR_APPLY2(real, tensor, TYPE_SRC, src, *tensor_data = (real)TH_half2float(*src_data);) \ -} - -#define IMPLEMENT_THTensor_COPY_TO_FROM_HALF(TYPENAMESRC, TYPE_SRC) \ -void THTensor_(copy##TYPENAMESRC)(THTensor *tensor, TH##TYPENAMESRC##Tensor *src) \ -{ \ - TH_TENSOR_APPLY2(real, tensor, TYPE_SRC, src, *tensor_data = *src_data;) \ -} - -#ifndef TH_REAL_IS_HALF -IMPLEMENT_THTensor_COPY(Byte, unsigned char) -IMPLEMENT_THTensor_COPY(Char, char) -IMPLEMENT_THTensor_COPY(Short, 
short) -IMPLEMENT_THTensor_COPY(Int, int) -IMPLEMENT_THTensor_COPY(Long, long) -IMPLEMENT_THTensor_COPY(Float, float) -IMPLEMENT_THTensor_COPY(Double, double) -IMPLEMENT_THTensor_COPY_FROM_HALF(Half, THHalf) -#else -/* only allow pass-through for Half */ -IMPLEMENT_THTensor_COPY_TO_FROM_HALF(Half, THHalf) -IMPLEMENT_THTensor_COPY_TO_HALF(Byte, unsigned char) -IMPLEMENT_THTensor_COPY_TO_HALF(Char, char) -IMPLEMENT_THTensor_COPY_TO_HALF(Short, short) -IMPLEMENT_THTensor_COPY_TO_HALF(Int, int) -IMPLEMENT_THTensor_COPY_TO_HALF(Long, long) -IMPLEMENT_THTensor_COPY_TO_HALF(Float, float) -IMPLEMENT_THTensor_COPY_TO_HALF(Double, double) - -#endif /* REAL_IS_HALF */ - -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/generic/THTensorCopy.h b/contrib/lua-torch/torch7/lib/TH/generic/THTensorCopy.h deleted file mode 100644 index b9e5bfc99..000000000 --- a/contrib/lua-torch/torch7/lib/TH/generic/THTensorCopy.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "generic/THTensorCopy.h" -#else - -/* Support for copy between different Tensor types */ - -TH_API void THTensor_(copy)(THTensor *tensor, THTensor *src); -TH_API void THTensor_(copyByte)(THTensor *tensor, struct THByteTensor *src); -TH_API void THTensor_(copyChar)(THTensor *tensor, struct THCharTensor *src); -TH_API void THTensor_(copyShort)(THTensor *tensor, struct THShortTensor *src); -TH_API void THTensor_(copyInt)(THTensor *tensor, struct THIntTensor *src); -TH_API void THTensor_(copyLong)(THTensor *tensor, struct THLongTensor *src); -TH_API void THTensor_(copyFloat)(THTensor *tensor, struct THFloatTensor *src); -TH_API void THTensor_(copyDouble)(THTensor *tensor, struct THDoubleTensor *src); -TH_API void THTensor_(copyHalf)(THTensor *tensor, struct THHalfTensor *src); - -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/generic/THTensorLapack.c b/contrib/lua-torch/torch7/lib/TH/generic/THTensorLapack.c deleted file mode 100644 index d4e52f6d7..000000000 --- 
a/contrib/lua-torch/torch7/lib/TH/generic/THTensorLapack.c +++ /dev/null @@ -1,1121 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "generic/THTensorLapack.c" -#else - -/* -Check if self is transpose of a contiguous matrix -*/ -static int THTensor_(isTransposedContiguous)(THTensor *self) -{ - return self->stride[0] == 1 && self->stride[1] == self->size[0]; -} -/* -If a matrix is a regular contiguous matrix, make sure it is transposed -because this is what we return from Lapack calls. -*/ -static void THTensor_(checkTransposed)(THTensor *self) -{ - if(THTensor_(isContiguous)(self)) - THTensor_(transpose)(self, NULL, 0, 1); - return; -} -/* -newContiguous followed by transpose -Similar to (newContiguous), but checks if the transpose of the matrix -is contiguous and also limited to 2D matrices. -*/ -static THTensor *THTensor_(newTransposedContiguous)(THTensor *self) -{ - THTensor *tensor; - if(THTensor_(isTransposedContiguous)(self)) - { - THTensor_(retain)(self); - tensor = self; - } - else - { - tensor = THTensor_(newContiguous)(self); - THTensor_(transpose)(tensor, NULL, 0, 1); - } - - return tensor; -} - -/* -Given the result tensor and src tensor, decide if the lapack call should use the -provided result tensor or should allocate a new space to put the result in. - -The returned tensor have to be freed by the calling function. - -nrows is required, because some lapack calls, require output space smaller than -input space, like underdetermined gels. 
-*/ -static THTensor *THTensor_(checkLapackClone)(THTensor *result, THTensor *src, int nrows) -{ - /* check if user wants to reuse src and if it is correct shape/size */ - if (src == result && THTensor_(isTransposedContiguous)(src) && src->size[1] == nrows) - THTensor_(retain)(result); - else if(src == result || result == NULL) /* in this case, user wants reuse of src, but its structure is not OK */ - result = THTensor_(new)(); - else - THTensor_(retain)(result); - return result; -} - -/* -Same as cloneColumnMajor, but accepts nrows argument, because some lapack calls require -the resulting tensor to be larger than src. -*/ -static THTensor *THTensor_(cloneColumnMajorNrows)(THTensor *self, THTensor *src, int nrows) -{ - THTensor *result; - THTensor *view; - - if (src == NULL) - src = self; - result = THTensor_(checkLapackClone)(self, src, nrows); - if (src == result) - return result; - - THTensor_(resize2d)(result, src->size[1], nrows); - THTensor_(checkTransposed)(result); - - if (src->size[0] == nrows) - THTensor_(copy)(result, src); - else - { - view = THTensor_(newNarrow)(result, 0, 0, src->size[0]); - THTensor_(copy)(view, src); - THTensor_(free)(view); - } - return result; -} - -/* -Create a clone of src in self column major order for use with Lapack. -If src == self, a new tensor is allocated, in any case, the return tensor should be -freed by calling function. 
-*/ -static THTensor *THTensor_(cloneColumnMajor)(THTensor *self, THTensor *src) -{ - return THTensor_(cloneColumnMajorNrows)(self, src, src->size[0]); -} - -void THTensor_(gesv)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a) -{ - int free_b = 0; - if (a == NULL) a = ra_; - if (b == NULL) b = rb_; - THArgCheck(a->nDimension == 2, 2, "A should have 2 dimensions, but has %d", - a->nDimension); - THArgCheck(b->nDimension == 1 || b->nDimension == 2, 1, "B should have 1 or 2 " - "dimensions, but has %d", b->nDimension); - THArgCheck(a->size[0] == a->size[1], 2, "A should be square, but is %ldx%ld", - a->size[0], a->size[1]); - THArgCheck(a->size[0] == b->size[0], 2, "A,B size incompatible - A has %ld " - "rows, B has %ld", a->size[0], b->size[0]); - - if (b->nDimension == 1) { - b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size[0], - b->stride[0], 1, 0); - free_b = 1; - } - - int n, nrhs, lda, ldb, info; - THIntTensor *ipiv; - THTensor *ra__; // working version of A matrix to be passed into lapack GELS - THTensor *rb__; // working version of B matrix to be passed into lapack GELS - - ra__ = THTensor_(cloneColumnMajor)(ra_, a); - rb__ = THTensor_(cloneColumnMajor)(rb_, b); - - n = (int)ra__->size[0]; - nrhs = (int)rb__->size[1]; - lda = n; - ldb = n; - - ipiv = THIntTensor_newWithSize1d((long)n); - THLapack_(gesv)(n, nrhs, - THTensor_(data)(ra__), lda, THIntTensor_data(ipiv), - THTensor_(data)(rb__), ldb, &info); - - THLapackCheckWithCleanup("Lapack Error in %s : U(%d,%d) is zero, singular U.", - THCleanup( - THTensor_(free)(ra__); - THTensor_(free)(rb__); - THIntTensor_free(ipiv); - if (free_b) THTensor_(free)(b);), - "gesv", info, info); - - THTensor_(freeCopyTo)(ra__, ra_); - THTensor_(freeCopyTo)(rb__, rb_); - THIntTensor_free(ipiv); - if (free_b) THTensor_(free)(b); -} - -void THTensor_(trtrs)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a, - const char *uplo, const char *trans, const char *diag) -{ - int free_b = 0; - if (a == 
NULL) a = ra_; - if (b == NULL) b = rb_; - THArgCheck(a->nDimension == 2, 2, "A should have 2 dimensions, but has %d", - a->nDimension); - THArgCheck(b->nDimension == 1 || b->nDimension == 2, 1, "B should have 1 or 2 " - "dimensions, but has %d", b->nDimension); - THArgCheck(a->size[0] == a->size[1], 2, "A should be square, but is %ldx%ld", - a->size[0], a->size[1]); - THArgCheck(a->size[0] == b->size[0], 2, "A,B size incompatible - A has %ld " - "rows, B has %ld", a->size[0], b->size[0]); - - if (b->nDimension == 1) { - b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size[0], - b->stride[0], 1, 0); - free_b = 1; - } - - int n, nrhs, lda, ldb, info; - THTensor *ra__; // working version of A matrix to be passed into lapack TRTRS - THTensor *rb__; // working version of B matrix to be passed into lapack TRTRS - - ra__ = THTensor_(cloneColumnMajor)(ra_, a); - rb__ = THTensor_(cloneColumnMajor)(rb_, b); - - n = (int)ra__->size[0]; - nrhs = (int)rb__->size[1]; - lda = n; - ldb = n; - - THLapack_(trtrs)(uplo[0], trans[0], diag[0], n, nrhs, - THTensor_(data)(ra__), lda, - THTensor_(data)(rb__), ldb, &info); - - - THLapackCheckWithCleanup("Lapack Error in %s : A(%d,%d) is zero, singular A", - THCleanup( - THTensor_(free)(ra__); - THTensor_(free)(rb__); - if (free_b) THTensor_(free)(b);), - "trtrs", info, info); - - THTensor_(freeCopyTo)(ra__, ra_); - THTensor_(freeCopyTo)(rb__, rb_); - if (free_b) THTensor_(free)(b); -} - -void THTensor_(gels)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a) -{ - int free_b = 0; - // Note that a = NULL is interpreted as a = ra_, and b = NULL as b = rb_. 
- if (a == NULL) a = ra_; - if (b == NULL) b = rb_; - THArgCheck(a->nDimension == 2, 2, "A should have 2 dimensions, but has %d", - a->nDimension); - THArgCheck(b->nDimension == 1 || b->nDimension == 2, 1, "B should have 1 or 2 " - "dimensions, but has %d", b->nDimension); - THArgCheck(a->size[0] == b->size[0], 2, "A,B size incompatible - A has %ld " - "rows, B has %ld", a->size[0], b->size[0]); - - if (b->nDimension == 1) { - b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size[0], - b->stride[0], 1, 0); - free_b = 1; - } - - int m, n, nrhs, lda, ldb, info, lwork; - THTensor *work = NULL; - real wkopt = 0; - - THTensor *ra__ = NULL; // working version of A matrix to be passed into lapack GELS - THTensor *rb__ = NULL; // working version of B matrix to be passed into lapack GELS - - ra__ = THTensor_(cloneColumnMajor)(ra_, a); - - m = ra__->size[0]; - n = ra__->size[1]; - lda = m; - ldb = (m > n) ? m : n; - - rb__ = THTensor_(cloneColumnMajorNrows)(rb_, b, ldb); - - nrhs = rb__->size[1]; - info = 0; - - - /* get optimal workspace size */ - THLapack_(gels)('N', m, n, nrhs, THTensor_(data)(ra__), lda, - THTensor_(data)(rb__), ldb, - &wkopt, -1, &info); - lwork = (int)wkopt; - work = THTensor_(newWithSize1d)(lwork); - THLapack_(gels)('N', m, n, nrhs, THTensor_(data)(ra__), lda, - THTensor_(data)(rb__), ldb, - THTensor_(data)(work), lwork, &info); - - THLapackCheckWithCleanup("Lapack Error in %s : The %d-th diagonal element of the triangular factor of A is zero", - THCleanup(THTensor_(free)(ra__); - THTensor_(free)(rb__); - THTensor_(free)(work); - if (free_b) THTensor_(free)(b);), - "gels", info,""); - - /* rb__ is currently ldb by nrhs; resize it to n by nrhs */ - rb__->size[0] = n; - if (rb__ != rb_) - THTensor_(resize2d)(rb_, n, nrhs); - - THTensor_(freeCopyTo)(ra__, ra_); - THTensor_(freeCopyTo)(rb__, rb_); - THTensor_(free)(work); - if (free_b) THTensor_(free)(b); -} - -void THTensor_(geev)(THTensor *re_, THTensor *rv_, THTensor *a_, const char 
*jobvr) -{ - int n, lda, lwork, info, ldvr; - THTensor *work, *wi, *wr, *a; - real wkopt; - real *rv_data; - long i; - - THTensor *re__ = NULL; - THTensor *rv__ = NULL; - - THArgCheck(a_->nDimension == 2, 1, "A should be 2 dimensional"); - THArgCheck(a_->size[0] == a_->size[1], 1,"A should be square"); - - /* we want to definitely clone a_ for geev*/ - a = THTensor_(cloneColumnMajor)(NULL, a_); - - n = a->size[0]; - lda = n; - - wi = THTensor_(newWithSize1d)(n); - wr = THTensor_(newWithSize1d)(n); - - rv_data = NULL; - ldvr = 1; - if (*jobvr == 'V') - { - THTensor_(resize2d)(rv_,n,n); - /* guard against someone passing a correct size, but wrong stride */ - rv__ = THTensor_(newTransposedContiguous)(rv_); - rv_data = THTensor_(data)(rv__); - ldvr = n; - } - THTensor_(resize2d)(re_,n,2); - re__ = THTensor_(newContiguous)(re_); - - /* get optimal workspace size */ - THLapack_(geev)('N', jobvr[0], n, THTensor_(data)(a), lda, THTensor_(data)(wr), THTensor_(data)(wi), - NULL, 1, rv_data, ldvr, &wkopt, -1, &info); - - lwork = (int)wkopt; - work = THTensor_(newWithSize1d)(lwork); - - THLapack_(geev)('N', jobvr[0], n, THTensor_(data)(a), lda, THTensor_(data)(wr), THTensor_(data)(wi), - NULL, 1, rv_data, ldvr, THTensor_(data)(work), lwork, &info); - - THLapackCheckWithCleanup(" Lapack Error in %s : %d off-diagonal elements of an didn't converge to zero", - THCleanup(THTensor_(free)(re__); - THTensor_(free)(rv__); - THTensor_(free)(a); - THTensor_(free)(wi); - THTensor_(free)(wr); - THTensor_(free)(work);), - "geev", info,""); - - { - real *re_data = THTensor_(data)(re__); - real *wi_data = THTensor_(data)(wi); - real *wr_data = THTensor_(data)(wr); - for (i=0; i<n; i++) - { - re_data[2*i] = wr_data[i]; - re_data[2*i+1] = wi_data[i]; - } - } - - if (*jobvr == 'V') - { - THTensor_(checkTransposed)(rv_); - THTensor_(freeCopyTo)(rv__, rv_); - } - THTensor_(freeCopyTo)(re__, re_); - THTensor_(free)(a); - THTensor_(free)(wi); - THTensor_(free)(wr); - THTensor_(free)(work); -} - 
-void THTensor_(syev)(THTensor *re_, THTensor *rv_, THTensor *a, const char *jobz, const char *uplo) -{ - if (a == NULL) a = rv_; - THArgCheck(a->nDimension == 2, 1, "A should be 2 dimensional"); - THArgCheck(a->size[0] == a->size[1], 1,"A should be square"); - - int n, lda, lwork, info; - THTensor *work; - real wkopt; - - THTensor *rv__ = NULL; - THTensor *re__ = NULL; - - rv__ = THTensor_(cloneColumnMajor)(rv_, a); - - n = rv__->size[0]; - lda = n; - - THTensor_(resize1d)(re_,n); - re__ = THTensor_(newContiguous)(re_); - - /* get optimal workspace size */ - THLapack_(syev)(jobz[0], uplo[0], n, THTensor_(data)(rv__), lda, - THTensor_(data)(re_), &wkopt, -1, &info); - lwork = (int)wkopt; - work = THTensor_(newWithSize1d)(lwork); - THLapack_(syev)(jobz[0], uplo[0], n, THTensor_(data)(rv__), lda, - THTensor_(data)(re_), THTensor_(data)(work), lwork, &info); - - THLapackCheckWithCleanup("Lapack Error %s : %d off-diagonal elements didn't converge to zero", - THCleanup(THTensor_(free)(rv__); - THTensor_(free)(re__); - THTensor_(free)(work);), - "syev", info,""); - - THTensor_(freeCopyTo)(rv__, rv_); - THTensor_(freeCopyTo)(re__, re_); - THTensor_(free)(work); -} - -void THTensor_(gesvd)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *a, const char* jobu) -{ - THTensor *ra_ = THTensor_(new)(); - THTensor_(gesvd2)(ru_, rs_, rv_, ra_, a, jobu); - THTensor_(free)(ra_); -} - -void THTensor_(gesvd2)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *ra_, THTensor *a, const char* jobu) -{ - if (a == NULL) a = ra_; - THArgCheck(a->nDimension == 2, 1, "A should be 2 dimensional"); - - int k,m, n, lda, ldu, ldvt, lwork, info; - THTensor *work; - THTensor *rvf_ = THTensor_(new)(); - real wkopt; - - THTensor *ra__ = NULL; - THTensor *ru__ = NULL; - THTensor *rs__ = NULL; - THTensor *rv__ = NULL; - - ra__ = THTensor_(cloneColumnMajor)(ra_, a); - - m = ra__->size[0]; - n = ra__->size[1]; - k = (m < n ? 
m : n); - - lda = m; - ldu = m; - ldvt = n; - - THTensor_(resize1d)(rs_,k); - THTensor_(resize2d)(rvf_,ldvt,n); - if (*jobu == 'A') - THTensor_(resize2d)(ru_,m,ldu); - else - THTensor_(resize2d)(ru_,k,ldu); - - THTensor_(checkTransposed)(ru_); - - /* guard against someone passing a correct size, but wrong stride */ - ru__ = THTensor_(newTransposedContiguous)(ru_); - rs__ = THTensor_(newContiguous)(rs_); - rv__ = THTensor_(newContiguous)(rvf_); - - THLapack_(gesvd)(jobu[0],jobu[0], - m,n,THTensor_(data)(ra__),lda, - THTensor_(data)(rs__), - THTensor_(data)(ru__), - ldu, - THTensor_(data)(rv__), ldvt, - &wkopt, -1, &info); - lwork = (int)wkopt; - work = THTensor_(newWithSize1d)(lwork); - THLapack_(gesvd)(jobu[0],jobu[0], - m,n,THTensor_(data)(ra__),lda, - THTensor_(data)(rs__), - THTensor_(data)(ru__), - ldu, - THTensor_(data)(rv__), ldvt, - THTensor_(data)(work),lwork, &info); - - THLapackCheckWithCleanup(" Lapack Error %s : %d superdiagonals failed to converge.", - THCleanup( - THTensor_(free)(ru__); - THTensor_(free)(rs__); - THTensor_(free)(rv__); - THTensor_(free)(ra__); - THTensor_(free)(work);), - "gesvd", info,""); - - if (*jobu == 'S') - THTensor_(narrow)(rv__,NULL,1,0,k); - - THTensor_(freeCopyTo)(ru__, ru_); - THTensor_(freeCopyTo)(rs__, rs_); - THTensor_(freeCopyTo)(rv__, rvf_); - THTensor_(freeCopyTo)(ra__, ra_); - THTensor_(free)(work); - - if (*jobu == 'S') { - THTensor_(narrow)(rvf_,NULL,1,0,k); - } - THTensor_(resizeAs)(rv_, rvf_); - THTensor_(copy)(rv_, rvf_); - THTensor_(free)(rvf_); -} - -void THTensor_(getri)(THTensor *ra_, THTensor *a) -{ - if (a == NULL) a = ra_; - THArgCheck(a->nDimension == 2, 1, "A should be 2 dimensional"); - THArgCheck(a->size[0] == a->size[1], 1, "A should be square"); - - int m, n, lda, info, lwork; - real wkopt; - THIntTensor *ipiv; - THTensor *work; - THTensor *ra__ = NULL; - - ra__ = THTensor_(cloneColumnMajor)(ra_, a); - - m = ra__->size[0]; - n = ra__->size[1]; - lda = m; - ipiv = THIntTensor_newWithSize1d((long)m); 
- - /* Run LU */ - THLapack_(getrf)(n, n, THTensor_(data)(ra__), lda, THIntTensor_data(ipiv), &info); - THLapackCheckWithCleanup("Lapack Error %s : U(%d,%d) is 0, U is singular", - THCleanup( - THTensor_(free)(ra__); - THIntTensor_free(ipiv);), - "getrf", info, info); - - /* Run inverse */ - THLapack_(getri)(n, THTensor_(data)(ra__), lda, THIntTensor_data(ipiv), &wkopt, -1, &info); - lwork = (int)wkopt; - work = THTensor_(newWithSize1d)(lwork); - THLapack_(getri)(n, THTensor_(data)(ra__), lda, THIntTensor_data(ipiv), THTensor_(data)(work), lwork, &info); - THLapackCheckWithCleanup("Lapack Error %s : U(%d,%d) is 0, U is singular", - THCleanup( - THTensor_(free)(ra__); - THTensor_(free)(work); - THIntTensor_free(ipiv);), - "getri", info, info); - - THTensor_(freeCopyTo)(ra__, ra_); - THTensor_(free)(work); - THIntTensor_free(ipiv); -} - -void THTensor_(clearUpLoTriangle)(THTensor *a, const char *uplo) -{ - THArgCheck(a->nDimension == 2, 1, "A should be 2 dimensional"); - THArgCheck(a->size[0] == a->size[1], 1, "A should be square"); - - int n = a->size[0]; - - /* Build full matrix */ - real *p = THTensor_(data)(a); - long i, j; - - /* Upper Triangular Case */ - if (uplo[0] == 'U') - { - /* Clear lower triangle (excluding diagonals) */ - for (i=0; i<n; i++) { - for (j=i+1; j<n; j++) { - p[n*i + j] = 0; - } - } - } - /* Lower Triangular Case */ - else if (uplo[0] == 'L') - { - /* Clear upper triangle (excluding diagonals) */ - for (i=0; i<n; i++) { - for (j=0; j<i; j++) { - p[n*i + j] = 0; - } - } - } -} - -void THTensor_(copyUpLoTriangle)(THTensor *a, const char *uplo) -{ - THArgCheck(a->nDimension == 2, 1, "A should be 2 dimensional"); - THArgCheck(a->size[0] == a->size[1], 1, "A should be square"); - - int n = a->size[0]; - - /* Build full matrix */ - real *p = THTensor_(data)(a); - long i, j; - - /* Upper Triangular Case */ - if (uplo[0] == 'U') - { - /* Clear lower triangle (excluding diagonals) */ - for (i=0; i<n; i++) { - for (j=i+1; j<n; j++) { - p[n*i + j] = 
p[n*j+i]; - } - } - } - /* Lower Triangular Case */ - else if (uplo[0] == 'L') - { - /* Clear upper triangle (excluding diagonals) */ - for (i=0; i<n; i++) { - for (j=0; j<i; j++) { - p[n*i + j] = p[n*j+i]; - } - } - } -} - -void THTensor_(potrf)(THTensor *ra_, THTensor *a, const char *uplo) -{ - if (a == NULL) a = ra_; - THArgCheck(a->nDimension == 2, 1, "A should be 2 dimensional"); - THArgCheck(a->size[0] == a->size[1], 1, "A should be square"); - - int n, lda, info; - THTensor *ra__ = NULL; - - ra__ = THTensor_(cloneColumnMajor)(ra_, a); - - n = ra__->size[0]; - lda = n; - - /* Run Factorization */ - THLapack_(potrf)(uplo[0], n, THTensor_(data)(ra__), lda, &info); - THLapackCheckWithCleanup("Lapack Error in %s : the leading minor of order %d is not positive definite", - THCleanup(THTensor_(free)(ra__);), - "potrf", info, ""); - - THTensor_(clearUpLoTriangle)(ra__, uplo); - THTensor_(freeCopyTo)(ra__, ra_); -} - -void THTensor_(potrs)(THTensor *rb_, THTensor *b, THTensor *a, const char *uplo) -{ - int free_b = 0; - if (b == NULL) b = rb_; - - THArgCheck(a->nDimension == 2, 2, "A should have 2 dimensions, but has %d", - a->nDimension); - THArgCheck(b->nDimension == 1 || b->nDimension == 2, 1, "B should have 1 or 2 " - "dimensions, but has %d", b->nDimension); - THArgCheck(a->size[0] == a->size[1], 2, "A should be square, but is %ldx%ld", - a->size[0], a->size[1]); - THArgCheck(a->size[0] == b->size[0], 2, "A,B size incompatible - A has %ld " - "rows, B has %ld", a->size[0], b->size[0]); - - if (b->nDimension == 1) { - b = THTensor_(newWithStorage2d)(b->storage, b->storageOffset, b->size[0], - b->stride[0], 1, 0); - free_b = 1; - } - - int n, nrhs, lda, ldb, info; - THTensor *ra__; // working version of A matrix to be passed into lapack TRTRS - THTensor *rb__; // working version of B matrix to be passed into lapack TRTRS - - ra__ = THTensor_(cloneColumnMajor)(NULL, a); - rb__ = THTensor_(cloneColumnMajor)(rb_, b); - - n = (int)ra__->size[0]; - nrhs = 
(int)rb__->size[1]; - lda = n; - ldb = n; - - THLapack_(potrs)(uplo[0], n, nrhs, THTensor_(data)(ra__), - lda, THTensor_(data)(rb__), ldb, &info); - - - THLapackCheckWithCleanup("Lapack Error in %s : A(%d,%d) is zero, singular A", - THCleanup( - THTensor_(free)(ra__); - THTensor_(free)(rb__); - if (free_b) THTensor_(free)(b);), - "potrs", info, info); - - if (free_b) THTensor_(free)(b); - THTensor_(free)(ra__); - THTensor_(freeCopyTo)(rb__, rb_); -} - -void THTensor_(potri)(THTensor *ra_, THTensor *a, const char *uplo) -{ - if (a == NULL) a = ra_; - THArgCheck(a->nDimension == 2, 1, "A should be 2 dimensional"); - THArgCheck(a->size[0] == a->size[1], 1, "A should be square"); - - int n, lda, info; - THTensor *ra__ = NULL; - - ra__ = THTensor_(cloneColumnMajor)(ra_, a); - - n = ra__->size[0]; - lda = n; - - /* Run inverse */ - THLapack_(potri)(uplo[0], n, THTensor_(data)(ra__), lda, &info); - THLapackCheckWithCleanup("Lapack Error %s : A(%d,%d) is 0, A cannot be factorized", - THCleanup(THTensor_(free)(ra__);), - "potri", info, info); - - THTensor_(copyUpLoTriangle)(ra__, uplo); - THTensor_(freeCopyTo)(ra__, ra_); -} - -/* - Computes the Cholesky factorization with complete pivoting of a real symmetric - positive semidefinite matrix. - - Args: - * `ra_` - result Tensor in which to store the factor U or L from the - Cholesky factorization. - * `rpiv_` - result IntTensor containing sparse permutation matrix P, encoded - as P[rpiv_[k], k] = 1. - * `a` - input Tensor; the input matrix to factorize. - * `uplo` - string; specifies whether the upper or lower triangular part of - the symmetric matrix A is stored. "U"/"L" for upper/lower - triangular. - * `tol` - double; user defined tolerance, or < 0 for automatic choice. - The algorithm terminates when the pivot <= tol. 
- */ -void THTensor_(pstrf)(THTensor *ra_, THIntTensor *rpiv_, THTensor *a, const char *uplo, real tol) { - THArgCheck(a->nDimension == 2, 1, "A should be 2 dimensional"); - THArgCheck(a->size[0] == a->size[1], 1, "A should be square"); - - int n = a->size[0]; - - THTensor *ra__ = THTensor_(cloneColumnMajor)(ra_, a); - THIntTensor_resize1d(rpiv_, n); - - // Allocate working tensor - THTensor *work = THTensor_(newWithSize1d)(2 * n); - - // Run Cholesky factorization - int lda = n; - int rank, info; - - THLapack_(pstrf)(uplo[0], n, THTensor_(data)(ra__), lda, - THIntTensor_data(rpiv_), &rank, tol, - THTensor_(data)(work), &info); - - THLapackCheckWithCleanup("Lapack Error %s : matrix is rank deficient or not positive semidefinite", - THCleanup( - THTensor_(free)(ra__); - THTensor_(free)(work);), - "pstrf", info,""); - - THTensor_(clearUpLoTriangle)(ra__, uplo); - - THTensor_(freeCopyTo)(ra__, ra_); - THTensor_(free)(work); -} - -/* - Perform a QR decomposition of a matrix. - - In LAPACK, two parts of the QR decomposition are implemented as two separate - functions: geqrf and orgqr. For flexibility and efficiency, these are wrapped - directly, below - but to make the common usage convenient, we also provide - this function, which calls them both and returns the results in a more - intuitive form. - - Args: - * `rq_` - result Tensor in which to store the Q part of the decomposition. - * `rr_` - result Tensor in which to store the R part of the decomposition. - * `a` - input Tensor; the matrix to decompose. - -*/ -void THTensor_(qr)(THTensor *rq_, THTensor *rr_, THTensor *a) -{ - int m = a->size[0]; - int n = a->size[1]; - int k = (m < n ? 
m : n); - THTensor *ra_ = THTensor_(new)(); - THTensor *rtau_ = THTensor_(new)(); - THTensor *rr__ = THTensor_(new)(); - THTensor_(geqrf)(ra_, rtau_, a); - THTensor_(resize2d)(rr__, k, ra_->size[1]); - THTensor_(narrow)(rr__, ra_, 0, 0, k); - THTensor_(triu)(rr_, rr__, 0); - THTensor_(resize2d)(rq_, ra_->size[0], k); - THTensor_(orgqr)(rq_, ra_, rtau_); - THTensor_(narrow)(rq_, rq_, 1, 0, k); - THTensor_(free)(ra_); - THTensor_(free)(rtau_); - THTensor_(free)(rr__); -} - -/* - The geqrf function does the main work of QR-decomposing a matrix. - However, rather than producing a Q matrix directly, it produces a sequence of - elementary reflectors which may later be composed to construct Q - for example - with the orgqr function, below. - - Args: - * `ra_` - Result matrix which will contain: - i) The elements of R, on and above the diagonal. - ii) Directions of the reflectors implicitly defining Q. - * `rtau_` - Result tensor which will contain the magnitudes of the reflectors - implicitly defining Q. - * `a` - Input matrix, to decompose. If NULL, `ra_` is used as input. - - For further details, please see the LAPACK documentation. - -*/ -void THTensor_(geqrf)(THTensor *ra_, THTensor *rtau_, THTensor *a) -{ - if (a == NULL) ra_ = a; - THArgCheck(a->nDimension == 2, 1, "A should be 2 dimensional"); - - THTensor *ra__ = NULL; - - /* Prepare the input for LAPACK, making a copy if necessary. */ - ra__ = THTensor_(cloneColumnMajor)(ra_, a); - - int m = ra__->size[0]; - int n = ra__->size[1]; - int k = (m < n ? m : n); - int lda = m; - THTensor_(resize1d)(rtau_, k); - - /* Dry-run to query the suggested size of the workspace. */ - int info = 0; - real wkopt = 0; - THLapack_(geqrf)(m, n, THTensor_(data)(ra__), lda, - THTensor_(data)(rtau_), - &wkopt, -1, &info); - - /* Allocate the workspace and call LAPACK to do the real work. 
*/ - int lwork = (int)wkopt; - THTensor *work = THTensor_(newWithSize1d)(lwork); - THLapack_(geqrf)(m, n, THTensor_(data)(ra__), lda, - THTensor_(data)(rtau_), - THTensor_(data)(work), lwork, &info); - - THLapackCheckWithCleanup("Lapack Error %s : unknown Lapack error. info = %i", - THCleanup( - THTensor_(free)(ra__); - THTensor_(free)(work);), - "geqrf", info,""); - - THTensor_(freeCopyTo)(ra__, ra_); - THTensor_(free)(work); -} - -/* - The orgqr function allows reconstruction of a matrix Q with orthogonal - columns, from a sequence of elementary reflectors, such as is produced by the - geqrf function. - - Args: - * `ra_` - result Tensor, which will contain the matrix Q. - * `a` - input Tensor, which should be a matrix with the directions of the - elementary reflectors below the diagonal. If NULL, `ra_` is used as - input. - * `tau` - input Tensor, containing the magnitudes of the elementary - reflectors. - - For further details, please see the LAPACK documentation. - -*/ -void THTensor_(orgqr)(THTensor *ra_, THTensor *a, THTensor *tau) -{ - if (a == NULL) a = ra_; - THArgCheck(a->nDimension == 2, 1, "A should be 2 dimensional"); - - THTensor *ra__ = NULL; - ra__ = THTensor_(cloneColumnMajor)(ra_, a); - - int m = ra__->size[0]; - int n = ra__->size[1]; - int k = tau->size[0]; - int lda = m; - - /* Dry-run to query the suggested size of the workspace. */ - int info = 0; - real wkopt = 0; - THLapack_(orgqr)(m, k, k, THTensor_(data)(ra__), lda, - THTensor_(data)(tau), - &wkopt, -1, &info); - - /* Allocate the workspace and call LAPACK to do the real work. */ - int lwork = (int)wkopt; - THTensor *work = THTensor_(newWithSize1d)(lwork); - THLapack_(orgqr)(m, k, k, THTensor_(data)(ra__), lda, - THTensor_(data)(tau), - THTensor_(data)(work), lwork, &info); - - THLapackCheckWithCleanup(" Lapack Error %s : unknown Lapack error. 
info = %i", - THCleanup( - THTensor_(free)(ra__); - THTensor_(free)(work);), - "orgqr", info,""); - THTensor_(freeCopyTo)(ra__, ra_); - THTensor_(free)(work); -} - -/* - The ormqr function multiplies Q with another matrix from a sequence of - elementary reflectors, such as is produced by the geqrf function. - - Args: - * `ra_` - result Tensor, which will contain the matrix Q' c. - * `a` - input Tensor, which should be a matrix with the directions of the - elementary reflectors below the diagonal. If NULL, `ra_` is used as - input. - * `tau` - input Tensor, containing the magnitudes of the elementary - reflectors. - * `c` - input Tensor, containing the matrix to be multiplied. - * `side` - char, determining whether c is left- or right-multiplied with Q. - * `trans` - char, determining whether to transpose Q before multiplying. - - For further details, please see the LAPACK documentation. - -*/ -void THTensor_(ormqr)(THTensor *ra_, THTensor *a, THTensor *tau, THTensor *c, const char *side, const char *trans) -{ - if (a == NULL) a = ra_; - THArgCheck(a->nDimension == 2, 1, "A should be 2 dimensional"); - - THTensor *ra__ = NULL; - ra__ = THTensor_(cloneColumnMajor)(ra_, c); - - int m = c->size[0]; - int n = c->size[1]; - int k = tau->size[0]; - int lda; - if (*side == 'L') - { - lda = m; - } - else - { - lda = n; - } - int ldc = m; - - /* Dry-run to query the suggested size of the workspace. */ - int info = 0; - real wkopt = 0; - THLapack_(ormqr)(side[0], trans[0], m, n, k, THTensor_(data)(a), lda, - THTensor_(data)(tau), THTensor_(data)(ra__), ldc, - &wkopt, -1, &info); - - /* Allocate the workspace and call LAPACK to do the real work. */ - int lwork = (int)wkopt; - THTensor *work = THTensor_(newWithSize1d)(lwork); - THLapack_(ormqr)(side[0], trans[0], m, n, k, THTensor_(data)(a), lda, - THTensor_(data)(tau), THTensor_(data)(ra__), ldc, - THTensor_(data)(work), lwork, &info); - - THLapackCheckWithCleanup(" Lapack Error %s : unknown Lapack error. 
info = %i", - THCleanup( - THTensor_(free)(ra__); - THTensor_(free)(work);), - "ormqr", info,""); - THTensor_(freeCopyTo)(ra__, ra_); - THTensor_(free)(work); -} - -void THTensor_(btrifact)(THTensor *ra_, THIntTensor *rpivots_, THIntTensor *rinfo_, int pivot, THTensor *a) -{ - THArgCheck(THTensor_(nDimension)(a) == 3, 1, "expected 3D tensor, got %dD", THTensor_(nDimension)(a)); - if (!pivot) { - THError("btrifact without pivoting is not implemented on the CPU"); - } - - if (ra_ != a) { - THTensor_(resizeAs)(ra_, a); - THTensor_(copy)(ra_, a); - } - - int m = a->size[1]; - int n = a->size[2]; - if (m != n) { - THError("btrifact is only implemented for square matrices"); - } - long num_batches = THTensor_(size)(a, 0); - THTensor *ra__; - int lda; - - if (ra_->stride[1] == 1) { - // column ordered, what BLAS wants - lda = ra_->stride[2]; - ra__ = ra_; - } else { - // not column ordered, need to make it such (requires copy) - THTensor *transp_r_ = THTensor_(newTranspose)(ra_, 1, 2); - ra__ = THTensor_(newClone)(transp_r_); - THTensor_(free)(transp_r_); - THTensor_(transpose)(ra__, NULL, 1, 2); - lda = ra__->stride[2]; - } - - THTensor *ai = THTensor_(new)(); - THTensor *rai = THTensor_(new)(); - THIntTensor *rpivoti = THIntTensor_new(); - - int info = 0; - int *info_ptr = &info; - if (rinfo_) { - THIntTensor_resize1d(rinfo_, num_batches); - info_ptr = THIntTensor_data(rinfo_); - } - - THIntTensor_resize2d(rpivots_, num_batches, n); - - long batch = 0; - for (; batch < num_batches; ++batch) { - THTensor_(select)(ai, a, 0, batch); - THTensor_(select)(rai, ra__, 0, batch); - THIntTensor_select(rpivoti, rpivots_, 0, batch); - - THLapack_(getrf)(n, n, THTensor_(data)(rai), lda, - THIntTensor_data(rpivoti), info_ptr); - if (rinfo_) { - info_ptr++; - } else if (info != 0) { - break; - } - } - - THTensor_(free)(ai); - THTensor_(free)(rai); - THIntTensor_free(rpivoti); - - if (ra__ != ra_) { - THTensor_(freeCopyTo)(ra__, ra_); - } - - if (!rinfo_ && info != 0) { - 
THError("failed to factorize batch element %ld (info == %d)", batch, info); - } -} - -void THTensor_(btrisolve)(THTensor *rb_, THTensor *b, THTensor *atf, THIntTensor *pivots) -{ - THArgCheck(THTensor_(nDimension)(atf) == 3, 1, "expected 3D tensor, got %dD", - THTensor_(nDimension)(atf)); - THArgCheck(THTensor_(nDimension)(b) == 3 || - THTensor_(nDimension)(b) == 2, 4, "expected 2D or 3D tensor"); - THArgCheck(THTensor_(size)(atf, 0) == - THTensor_(size)(b, 0), 3, "number of batches must be equal"); - THArgCheck(THTensor_(size)(atf, 1) == - THTensor_(size)(atf, 2), 3, "A matrices must be square"); - THArgCheck(THTensor_(size)(atf, 1) == - THTensor_(size)(b, 1), 3, "dimensions of A and b must be equal"); - - if (rb_ != b) { - THTensor_(resizeAs)(rb_, b); - THTensor_(copy)(rb_, b); - } - - long num_batches = atf->size[0]; - long n = atf->size[1]; - int nrhs = rb_->nDimension > 2 ? rb_->size[2] : 1; - - int lda, ldb; - THTensor *atf_; - THTensor *rb__; - - // correct ordering of A - if (atf->stride[1] == 1) { - // column ordered, what BLAS wants - lda = atf->stride[2]; - atf_ = atf; - } else { - // not column ordered, need to make it such (requires copy) - // it would be nice if we could use the op(A) flags to automatically - // transpose A if needed, but this leads to unpredictable behavior if the - // user clones A_tf later with a different ordering - THTensor *transp_r_ = THTensor_(newTranspose)(atf, 1, 2); - atf_ = THTensor_(newClone)(transp_r_); - THTensor_(free)(transp_r_); - THTensor_(transpose)(atf_, NULL, 1, 2); - lda = atf_->stride[2]; - } - - // correct ordering of B - if (rb_->stride[1] == 1) { - // column ordered - if (rb_->nDimension == 2 || rb_->size[2] == 1) { - ldb = n; - } else { - ldb = rb_->stride[2]; - } - rb__ = rb_; - } else { - // make column ordered - if (rb_->nDimension > 2) { - THTensor *transp_r_ = THTensor_(newTranspose)(rb_, 1, 2); - rb__ = THTensor_(newClone)(transp_r_); - THTensor_(free)(transp_r_); - THTensor_(transpose)(rb__, NULL, 1, 
2); - ldb = rb__->stride[2]; - } else { - rb__ = THTensor_(newClone)(rb_); - ldb = n; - } - } - - THTensor *ai = THTensor_(new)(); - THTensor *rbi = THTensor_(new)(); - THIntTensor *pivoti = THIntTensor_new(); - - if (!THIntTensor_isContiguous(pivots)) { - THError("Error: rpivots_ is not contiguous."); - } - - for (long batch = 0; batch < num_batches; ++batch) { - THTensor_(select)(ai, atf_, 0, batch); - THTensor_(select)(rbi, rb__, 0, batch); - THIntTensor_select(pivoti, pivots, 0, batch); - -#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) - int info; - THLapack_(getrs)('N', n, nrhs, THTensor_(data)(ai), lda, - THIntTensor_data(pivoti), THTensor_(data)(rbi), - ldb, &info); - if (info != 0) { - THError("Error: Nonzero info."); - } -#else - THError("Unimplemented"); -#endif - } - - THTensor_(free)(ai); - THTensor_(free)(rbi); - THIntTensor_free(pivoti); - - if (atf_ != atf) { - THTensor_(free)(atf_); - } - - if (rb__ != rb_) { - THTensor_(freeCopyTo)(rb__, rb_); - } -} - -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/generic/THTensorLapack.h b/contrib/lua-torch/torch7/lib/TH/generic/THTensorLapack.h deleted file mode 100644 index 878594348..000000000 --- a/contrib/lua-torch/torch7/lib/TH/generic/THTensorLapack.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "generic/THTensorLapack.h" -#else - -TH_API void THTensor_(gesv)(THTensor *rb_, THTensor *ra_, THTensor *b_, THTensor *a_); -TH_API void THTensor_(trtrs)(THTensor *rb_, THTensor *ra_, THTensor *b_, THTensor *a_, const char *uplo, const char *trans, const char *diag); -TH_API void THTensor_(gels)(THTensor *rb_, THTensor *ra_, THTensor *b_, THTensor *a_); -TH_API void THTensor_(syev)(THTensor *re_, THTensor *rv_, THTensor *a_, const char *jobz, const char *uplo); -TH_API void THTensor_(geev)(THTensor *re_, THTensor *rv_, THTensor *a_, const char *jobvr); -TH_API void THTensor_(gesvd)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *a, const char *jobu); 
-TH_API void THTensor_(gesvd2)(THTensor *ru_, THTensor *rs_, THTensor *rv_, THTensor *ra_, THTensor *a, const char *jobu); -TH_API void THTensor_(getri)(THTensor *ra_, THTensor *a); -TH_API void THTensor_(potrf)(THTensor *ra_, THTensor *a, const char *uplo); -TH_API void THTensor_(potrs)(THTensor *rb_, THTensor *b_, THTensor *a_, const char *uplo); -TH_API void THTensor_(potri)(THTensor *ra_, THTensor *a, const char *uplo); -TH_API void THTensor_(qr)(THTensor *rq_, THTensor *rr_, THTensor *a); -TH_API void THTensor_(geqrf)(THTensor *ra_, THTensor *rtau_, THTensor *a); -TH_API void THTensor_(orgqr)(THTensor *ra_, THTensor *a, THTensor *tau); -TH_API void THTensor_(ormqr)(THTensor *ra_, THTensor *a, THTensor *tau, THTensor *c, const char *side, const char *trans); -TH_API void THTensor_(pstrf)(THTensor *ra_, THIntTensor *rpiv_, THTensor*a, const char* uplo, real tol); - -TH_API void THTensor_(btrifact)(THTensor *ra_, THIntTensor *rpivots_, THIntTensor *rinfo_, int pivot, THTensor *a); -TH_API void THTensor_(btrisolve)(THTensor *rb_, THTensor *b, THTensor *atf, THIntTensor *pivots); - -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/generic/THTensorMath.c b/contrib/lua-torch/torch7/lib/TH/generic/THTensorMath.c deleted file mode 100644 index db7a0cb19..000000000 --- a/contrib/lua-torch/torch7/lib/TH/generic/THTensorMath.c +++ /dev/null @@ -1,3275 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "generic/THTensorMath.c" -#else - -#ifndef NAN - #define NAN (nan(NULL)) -#endif - -#ifdef _OPENMP -#include <omp.h> -#endif - -#define TH_OMP_OVERHEAD_THRESHOLD 100000 - -#ifdef _OPENMP - -#ifndef _WIN32 -#define PRAGMA(P) _Pragma(#P) -#else -#define PRAGMA(P) __pragma(P) -#endif - -#define TH_TENSOR_APPLY_CONTIG(TYPE, TENSOR, CODE) \ -{ \ - ptrdiff_t TH_TENSOR_size = THTensor_(nElement)(TENSOR); \ - PRAGMA(omp parallel if (TH_TENSOR_size > TH_OMP_OVERHEAD_THRESHOLD)) \ - { \ - size_t num_threads = omp_get_num_threads(); \ - size_t tid = omp_get_thread_num(); \ - 
ptrdiff_t TH_TENSOR_offset = tid * (TH_TENSOR_size / num_threads); \ - ptrdiff_t TH_TENSOR_end = tid == num_threads - 1 ? TH_TENSOR_size : \ - TH_TENSOR_offset + TH_TENSOR_size / num_threads; \ - ptrdiff_t TENSOR##_len = TH_TENSOR_end - TH_TENSOR_offset; \ - TYPE *TENSOR##_data = THTensor_(data)(TENSOR) + TH_TENSOR_offset; \ - CODE \ - } \ -} -#else -#define TH_TENSOR_APPLY_CONTIG(TYPE, TENSOR, CODE) \ -{ \ - TYPE *TENSOR##_data = THTensor_(data)(TENSOR); \ - ptrdiff_t TENSOR##_len = THTensor_(nElement)(TENSOR); \ - CODE \ -} -#endif - -#ifdef _OPENMP -#define TH_TENSOR_APPLY2_CONTIG(TYPE1, TENSOR1, TYPE2, TENSOR2, CODE) \ -{ \ - ptrdiff_t TH_TENSOR_size = THTensor_(nElement)(TENSOR1); \ - PRAGMA(omp parallel if (TH_TENSOR_size > TH_OMP_OVERHEAD_THRESHOLD)) \ - { \ - size_t num_threads = omp_get_num_threads(); \ - size_t tid = omp_get_thread_num(); \ - ptrdiff_t TH_TENSOR_offset = tid * (TH_TENSOR_size / num_threads); \ - ptrdiff_t TH_TENSOR_end = tid == num_threads - 1 ? TH_TENSOR_size : \ - TH_TENSOR_offset + TH_TENSOR_size / num_threads; \ - ptrdiff_t TENSOR1##_len = TH_TENSOR_end - TH_TENSOR_offset; \ - TYPE1 *TENSOR1##_data = THTensor_(data)(TENSOR1) + TH_TENSOR_offset; \ - TYPE2 *TENSOR2##_data = THTensor_(data)(TENSOR2) + TH_TENSOR_offset; \ - CODE \ - } \ -} -#else -#define TH_TENSOR_APPLY2_CONTIG(TYPE1, TENSOR1, TYPE2, TENSOR2, CODE) \ -{ \ - TYPE1 *TENSOR1##_data = THTensor_(data)(TENSOR1); \ - TYPE2 *TENSOR2##_data = THTensor_(data)(TENSOR2); \ - ptrdiff_t TENSOR1##_len = THTensor_(nElement)(TENSOR1); \ - CODE \ -} -#endif - -#ifdef _OPENMP -#define TH_TENSOR_APPLY3_CONTIG(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, CODE) \ -{ \ - ptrdiff_t TH_TENSOR_size = THTensor_(nElement)(TENSOR1); \ - PRAGMA(omp parallel if (TH_TENSOR_size > TH_OMP_OVERHEAD_THRESHOLD)) \ - { \ - size_t num_threads = omp_get_num_threads(); \ - size_t tid = omp_get_thread_num(); \ - ptrdiff_t TH_TENSOR_offset = tid * (TH_TENSOR_size / num_threads); \ - ptrdiff_t TH_TENSOR_end = 
tid == num_threads - 1 ? TH_TENSOR_size : \ - TH_TENSOR_offset + TH_TENSOR_size / num_threads; \ - ptrdiff_t TENSOR1##_len = TH_TENSOR_end - TH_TENSOR_offset; \ - TYPE1 *TENSOR1##_data = THTensor_(data)(TENSOR1) + TH_TENSOR_offset; \ - TYPE2 *TENSOR2##_data = THTensor_(data)(TENSOR2) + TH_TENSOR_offset; \ - TYPE3 *TENSOR3##_data = THTensor_(data)(TENSOR3) + TH_TENSOR_offset; \ - CODE \ - } \ -} -#else -#define TH_TENSOR_APPLY3_CONTIG(TYPE1, TENSOR1, TYPE2, TENSOR2, TYPE3, TENSOR3, CODE) \ -{ \ - TYPE1 *TENSOR1##_data = THTensor_(data)(TENSOR1); \ - TYPE2 *TENSOR2##_data = THTensor_(data)(TENSOR2); \ - TYPE3 *TENSOR3##_data = THTensor_(data)(TENSOR3); \ - ptrdiff_t TENSOR1##_len = THTensor_(nElement)(TENSOR1); \ - CODE \ -} -#endif - -void THTensor_(fill)(THTensor *r_, real value) -{ - if (THTensor_(isContiguous)(r_) || THTensor_(isTransposed)(r_)) { - TH_TENSOR_APPLY_CONTIG(real, r_, THVector_(fill)(r__data, value, r__len);); - } else { - TH_TENSOR_APPLY(real, r_, - if (r__stride == 1) { - THVector_(fill)(r__data, value, r__size); - r__i = r__size; - r__data += r__stride * r__size; - break; - } else { - *r__data = value; - } - ); - } -} - -void THTensor_(zero)(THTensor *r_) -{ - THTensor_(fill)(r_, 0); -} - -void THTensor_(maskedFill)(THTensor *tensor, THByteTensor *mask, real value) -{ - TH_TENSOR_APPLY2(real, tensor, unsigned char, mask, - if (*mask_data > 1) - { - THFree(mask_counter); - THFree(tensor_counter); - THError("Mask tensor can take 0 and 1 values only"); - } - else if (*mask_data == 1) - { - *tensor_data = value; - }); -} - -void THTensor_(maskedCopy)(THTensor *tensor, THByteTensor *mask, THTensor* src ) -{ - THTensor *srct = THTensor_(newContiguous)(src); - real *src_data = THTensor_(data)(srct); - ptrdiff_t cntr = 0; - ptrdiff_t nelem = THTensor_(nElement)(srct); - if (THTensor_(nElement)(tensor) != THByteTensor_nElement(mask)) - { - THTensor_(free)(srct); - THError("Number of elements of destination tensor != Number of elements in mask"); - } - 
TH_TENSOR_APPLY2(real, tensor, unsigned char, mask, - if (*mask_data > 1) - { - THTensor_(free)(srct); - THFree(mask_counter); - THFree(tensor_counter); - THError("Mask tensor can take 0 and 1 values only"); - } - else if (*mask_data == 1) - { - if (cntr == nelem) - { - THTensor_(free)(srct); - THFree(mask_counter); - THFree(tensor_counter); - THError("Number of elements of src < number of ones in mask"); - } - *tensor_data = *src_data; - src_data++; - cntr++; - }); - THTensor_(free)(srct); -} - -void THTensor_(maskedSelect)(THTensor *tensor, THTensor *src, THByteTensor *mask) -{ - ptrdiff_t numel = THByteTensor_sumall(mask); - real *tensor_data; - -#ifdef DEBUG - THAssert(numel <= LONG_MAX); -#endif - THTensor_(resize1d)(tensor,numel); - tensor_data = THTensor_(data)(tensor); - TH_TENSOR_APPLY2(real, src, unsigned char, mask, - if (*mask_data > 1) - { - THFree(mask_counter); - THFree(src_counter); - THError("Mask tensor can take 0 and 1 values only"); - } - else if (*mask_data == 1) - { - *tensor_data = *src_data; - tensor_data++; - }); -} - -// Finds non-zero elements of a tensor and returns their subscripts -void THTensor_(nonzero)(THLongTensor *subscript, THTensor *tensor) -{ - ptrdiff_t numel = 0; - long *subscript_data; - long i = 0; - long dim; - long div = 1; -#ifdef TH_REAL_IS_HALF -#define IS_NONZERO(val) ((val.x & 0x7fff) != 0) -#else -#define IS_NONZERO(val) ((val)!=0) -#endif - - /* First Pass to determine size of subscripts */ - TH_TENSOR_APPLY(real, tensor, - if IS_NONZERO(*tensor_data) { - ++numel; - }); -#ifdef DEBUG - THAssert(numel <= LONG_MAX); -#endif - THLongTensor_resize2d(subscript, numel, tensor->nDimension); - - /* Second pass populates subscripts */ - subscript_data = THLongTensor_data(subscript); - TH_TENSOR_APPLY(real, tensor, - if IS_NONZERO(*tensor_data) { - div = 1; - - for (dim = tensor->nDimension - 1; dim >= 0; dim--) { - *(subscript_data + dim) = (i/div) % tensor->size[dim]; - div *= tensor->size[dim]; - } - - subscript_data += 
tensor->nDimension; - } - ++i;); -} - -void THTensor_(indexSelect)(THTensor *tensor, THTensor *src, int dim, THLongTensor *index) -{ - ptrdiff_t i, numel; - THLongStorage *newSize; - THTensor *tSlice, *sSlice; - long *index_data; - real *tensor_data, *src_data; - - THArgCheck(index->nDimension == 1, 3, "Index is supposed to be a vector"); - THArgCheck(dim < src->nDimension, 4,"Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); - THArgCheck(src->nDimension > 0,2,"Source tensor is empty"); - - numel = THLongTensor_nElement(index); - - newSize = THLongStorage_newWithSize(src->nDimension); - THLongStorage_rawCopy(newSize,src->size); -#ifdef DEBUG - THAssert(numel <= LONG_MAX); -#endif - newSize->data[dim] = numel; - THTensor_(resize)(tensor,newSize,NULL); - THLongStorage_free(newSize); - - index = THLongTensor_newContiguous(index); - index_data = THLongTensor_data(index); - - if (dim == 0 && THTensor_(isContiguous)(src) && THTensor_(isContiguous)(tensor)) - { - tensor_data = THTensor_(data)(tensor); - src_data = THTensor_(data)(src); - ptrdiff_t rowsize = THTensor_(nElement)(src) / src->size[0]; - - // check that the indices are within range - long max = src->size[0] - 1 + TH_INDEX_BASE; - for (i=0; i<numel; i++) { - if (index_data[i] < TH_INDEX_BASE || index_data[i] > max) { - THLongTensor_free(index); - THError("index out of range"); - } - } - - if (src->nDimension == 1) { - #pragma omp parallel for if(numel > TH_OMP_OVERHEAD_THRESHOLD) private(i) - for (i=0; i<numel; i++) - tensor_data[i] = src_data[index_data[i] - TH_INDEX_BASE]; - } else { - #pragma omp parallel for if(numel*rowsize > TH_OMP_OVERHEAD_THRESHOLD) private(i) - for (i=0; i<numel; i++) - memcpy(tensor_data + i*rowsize, src_data + (index_data[i] - TH_INDEX_BASE)*rowsize, rowsize*sizeof(real)); - } - } - else if (src->nDimension == 1) - { - for (i=0; i<numel; i++) - THTensor_(set1d)(tensor,i,THTensor_(get1d)(src,index_data[i] - TH_INDEX_BASE)); - } - else - { - for (i=0; i<numel; i++) - { 
- tSlice = THTensor_(new)(); - sSlice = THTensor_(new)(); - THTensor_(select)(tSlice, tensor, dim, i); - THTensor_(select)(sSlice, src, dim, index_data[i] - TH_INDEX_BASE); - THTensor_(copy)(tSlice, sSlice); - THTensor_(free)(tSlice); - THTensor_(free)(sSlice); - } - } - - THLongTensor_free(index); -} - -void THTensor_(indexCopy)(THTensor *tensor, int dim, THLongTensor *index, THTensor *src) -{ - ptrdiff_t i, numel; - THTensor *tSlice, *sSlice; - long *index_data; - - numel = THLongTensor_nElement(index); - THArgCheck(index->nDimension == 1, 3, "Index is supposed to be a vector"); - THArgCheck(dim < src->nDimension, 4, "Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); - THArgCheck(numel == src->size[dim],4,"Number of indices should be equal to source:size(dim)"); - - index = THLongTensor_newContiguous(index); - index_data = THLongTensor_data(index); - - if (tensor->nDimension > 1 ) - { - tSlice = THTensor_(new)(); - sSlice = THTensor_(new)(); - - for (i=0; i<numel; i++) - { - THTensor_(select)(tSlice, tensor, dim, index_data[i] - TH_INDEX_BASE); - THTensor_(select)(sSlice, src, dim, i); - THTensor_(copy)(tSlice, sSlice); - } - - THTensor_(free)(tSlice); - THTensor_(free)(sSlice); - } - else - { - for (i=0; i<numel; i++) - { - THTensor_(set1d)(tensor, index_data[i] - TH_INDEX_BASE, THTensor_(get1d)(src,i)); - } - } - THLongTensor_free(index); -} - -void THTensor_(indexAdd)(THTensor *tensor, int dim, THLongTensor *index, THTensor *src) -{ - ptrdiff_t i, numel; - THTensor *tSlice, *sSlice; - long *index_data; - - numel = THLongTensor_nElement(index); - THArgCheck(index->nDimension == 1, 3, "Index is supposed to be a vector"); - THArgCheck(dim < src->nDimension, 4,"Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); - THArgCheck(numel == src->size[dim],4,"Number of indices should be equal to source:size(dim)"); - - index = THLongTensor_newContiguous(index); - index_data = THLongTensor_data(index); - - if (tensor->nDimension > 1) - { - 
tSlice = THTensor_(new)(); - sSlice = THTensor_(new)(); - - for (i=0; i<numel; i++) - { - THTensor_(select)(tSlice, tensor, dim, index_data[i] - TH_INDEX_BASE); - THTensor_(select)(sSlice, src, dim, i); - THTensor_(cadd)(tSlice, tSlice, 1.0, sSlice); - } - - THTensor_(free)(tSlice); - THTensor_(free)(sSlice); - } - else - { - for (i=0; i<numel; i++) - { - THTensor_(set1d)(tensor, - index_data[i] - TH_INDEX_BASE, - THTensor_(get1d)(src,i) + THTensor_(get1d)(tensor,index_data[i] - TH_INDEX_BASE)); - } - } - THLongTensor_free(index); -} - -void THTensor_(indexFill)(THTensor *tensor, int dim, THLongTensor *index, real val) -{ - ptrdiff_t i, numel; - THTensor *tSlice; - long *index_data; - - numel = THLongTensor_nElement(index); - THArgCheck(index->nDimension == 1, 3, "Index is supposed to be a vector"); - THArgCheck(dim < tensor->nDimension, 4,"Indexing dim %d is out of bounds of tensor", dim + TH_INDEX_BASE); - - index = THLongTensor_newContiguous(index); - index_data = THLongTensor_data(index); - - for (i=0; i<numel; i++) - { - if (tensor->nDimension > 1) - { - tSlice = THTensor_(new)(); - THTensor_(select)(tSlice, tensor,dim,index_data[i] - TH_INDEX_BASE); - THTensor_(fill)(tSlice, val); - THTensor_(free)(tSlice); - } - else - { - THTensor_(set1d)(tensor, index_data[i] - TH_INDEX_BASE, val); - } - } - THLongTensor_free(index); -} - -void THTensor_(gather)(THTensor *tensor, THTensor *src, int dim, THLongTensor *index) -{ - long elems_per_row, i, idx; - - THArgCheck(THTensor_(nDimension)(src) == THTensor_(nDimension)(tensor), 2, - "Input tensor must have same dimensions as output tensor"); - THArgCheck(dim < THTensor_(nDimension)(tensor), 3, "Index dimension is out of bounds"); - THArgCheck(THLongTensor_nDimension(index) == THTensor_(nDimension)(src), 4, - "Index tensor must have same dimensions as input tensor"); - - elems_per_row = THLongTensor_size(index, dim); - - TH_TENSOR_DIM_APPLY3(real, tensor, real, src, long, index, dim, - for (i = 0; i < elems_per_row; ++i) 
- { - idx = *(index_data + i*index_stride); - if (idx < TH_INDEX_BASE || idx >= src_size + TH_INDEX_BASE) - { - THFree(TH_TENSOR_DIM_APPLY_counter); - THError("Invalid index in gather"); - } - *(tensor_data + i*tensor_stride) = src_data[(idx - TH_INDEX_BASE) * src_stride]; - }) -} - -void THTensor_(scatter)(THTensor *tensor, int dim, THLongTensor *index, THTensor *src) -{ - long elems_per_row, i, idx; - - THArgCheck(dim < THTensor_(nDimension)(tensor), 2, "Index dimension is out of bounds"); - THArgCheck(THLongTensor_nDimension(index) == THTensor_(nDimension)(tensor), 3, - "Index tensor must have same dimensions as output tensor"); - THArgCheck(THTensor_(nDimension)(src) == THTensor_(nDimension)(tensor), 4, - "Input tensor must have same dimensions as output tensor"); - - elems_per_row = THLongTensor_size(index, dim); - - TH_TENSOR_DIM_APPLY3(real, tensor, real, src, long, index, dim, - for (i = 0; i < elems_per_row; ++i) - { - idx = *(index_data + i*index_stride); - if (idx < TH_INDEX_BASE || idx >= tensor_size + TH_INDEX_BASE) - { - THFree(TH_TENSOR_DIM_APPLY_counter); - THError("Invalid index in scatter"); - } - tensor_data[(idx - TH_INDEX_BASE) * tensor_stride] = *(src_data + i*src_stride); - }) -} - -void THTensor_(scatterAdd)(THTensor *tensor, int dim, THLongTensor *index, THTensor *src) -{ - long elems_per_row, i, idx; - - THArgCheck(dim < THTensor_(nDimension)(tensor), 2, "Index dimension is out of bounds"); - THArgCheck(THLongTensor_nDimension(index) == THTensor_(nDimension)(tensor), 3, - "Index tensor must have same dimensions as output tensor"); - THArgCheck(THTensor_(nDimension)(src) == THTensor_(nDimension)(tensor), 4, - "Input tensor must have same dimensions as output tensor"); - - elems_per_row = THLongTensor_size(index, dim); - - TH_TENSOR_DIM_APPLY3(real, tensor, real, src, long, index, dim, - for (i = 0; i < elems_per_row; ++i) - { - idx = *(index_data + i*index_stride); - if (idx < TH_INDEX_BASE || idx >= tensor_size + TH_INDEX_BASE) - { - 
THFree(TH_TENSOR_DIM_APPLY_counter); - THError("Invalid index in scatterAdd"); - } - tensor_data[(idx - TH_INDEX_BASE) * tensor_stride] += *(src_data + i*src_stride); - }) -} - -void THTensor_(scatterFill)(THTensor *tensor, int dim, THLongTensor *index, real val) -{ - long elems_per_row, i, idx; - - THArgCheck(dim < THTensor_(nDimension)(tensor), 2, "Index dimension is out of bounds"); - THArgCheck(THLongTensor_nDimension(index) == THTensor_(nDimension)(tensor), 3, - "Index tensor must have same dimensions as output tensor"); - - elems_per_row = THLongTensor_size(index, dim); - - TH_TENSOR_DIM_APPLY2(real, tensor, long, index, dim, - for (i = 0; i < elems_per_row; ++i) - { - idx = *(index_data + i*index_stride); - if (idx < TH_INDEX_BASE || idx >= tensor_size + TH_INDEX_BASE) - { - THFree(TH_TENSOR_DIM_APPLY_counter); - THError("Invalid index in scatter"); - } - tensor_data[(idx - TH_INDEX_BASE) * tensor_stride] = val; - }) -} - -accreal THTensor_(dot)(THTensor *tensor, THTensor *src) -{ - accreal sum = 0; - /* we use a trick here. careful with that. */ - TH_TENSOR_APPLY2(real, tensor, real, src, - long sz = (tensor_size-tensor_i < src_size-src_i ? 
tensor_size-tensor_i : src_size-src_i); - sum += THBlas_(dot)(sz, src_data, src_stride, tensor_data, tensor_stride); - tensor_i += sz; - src_i += sz; - tensor_data += sz*tensor_stride; - src_data += sz*src_stride; - break;); - return sum; -} - - -#undef th_isnan -#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) -#define th_isnan(val) \ -(isnan(val)) -#else -#define th_isnan(val) (0) -#endif - -#undef th_isnan_break -#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) -#define th_isnan_break(val) \ -if (isnan(val)) break; -#else -#define th_isnan_break(val) -#endif - -real THTensor_(minall)(THTensor *tensor) -{ - real theMin; - real value; - - THArgCheck(tensor->nDimension > 0, 1, "tensor must have one dimension"); - theMin = THTensor_(data)(tensor)[0]; - TH_TENSOR_APPLY(real, tensor, - value = *tensor_data; - /* This is not the same as value<theMin in the case of NaNs */ - if(!(value >= theMin)) - { - theMin = value; - th_isnan_break(value) - }); - return theMin; -} - -real THTensor_(maxall)(THTensor *tensor) -{ - real theMax; - real value; - - THArgCheck(tensor->nDimension > 0, 1, "tensor must have one dimension"); - theMax = THTensor_(data)(tensor)[0]; - TH_TENSOR_APPLY(real, tensor, - value = *tensor_data; - /* This is not the same as value>theMax in the case of NaNs */ - if(!(value <= theMax)) - { - theMax = value; - th_isnan_break(value) - }); - return theMax; -} - -static void THTensor_(quickselectnoidx)(real *arr, long k, long elements, long stride); - -real THTensor_(medianall)(THTensor *tensor) -{ - THArgCheck(tensor->nDimension > 0, 1, "tensor must have one dimension"); - - real theMedian; - ptrdiff_t numel; - long k; - THTensor *temp_; - real *temp__data; - - numel = THTensor_(nElement)(tensor); - k = (numel-1) >> 1; - - temp_ = THTensor_(newClone)(tensor); - temp__data = THTensor_(data)(temp_); - - THTensor_(quickselectnoidx)(temp__data, k, numel, 1); - - theMedian = temp__data[k]; - - THTensor_(free)(temp_); - - return theMedian; -} 
- -accreal THTensor_(sumall)(THTensor *tensor) -{ - accreal sum = 0; - TH_TENSOR_APPLY(real, tensor, sum += *tensor_data;); - return sum; -} - -accreal THTensor_(prodall)(THTensor *tensor) -{ - accreal prod = 1; - TH_TENSOR_APPLY(real, tensor, prod *= *tensor_data;); - return prod; -} - -void THTensor_(add)(THTensor *r_, THTensor *t, real value) -{ - THTensor_(resizeAs)(r_, t); - if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(nElement)(r_) == THTensor_(nElement)(t)) { - TH_TENSOR_APPLY2_CONTIG(real, r_, real, t, THVector_(adds)(r__data, t_data, value, r__len);); - } else { - TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data + value;); - } -} - -void THTensor_(sub)(THTensor *r_, THTensor *t, real value) -{ - THTensor_(add)(r_, t, -value); -} - -void THTensor_(mul)(THTensor *r_, THTensor *t, real value) -{ - THTensor_(resizeAs)(r_, t); - if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(nElement)(r_) == THTensor_(nElement)(t)) { - TH_TENSOR_APPLY2_CONTIG(real, r_, real, t, THVector_(muls)(r__data, t_data, value, r__len);); - } else { - TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data * value;); - } -} - -void THTensor_(div)(THTensor *r_, THTensor *t, real value) -{ - THTensor_(resizeAs)(r_, t); - if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(nElement)(r_) == THTensor_(nElement)(t)) { - TH_TENSOR_APPLY2_CONTIG(real, r_, real, t, THVector_(divs)(r__data, t_data, value, r__len);); - } else { - TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data / value;); - } -} - -void THTensor_(lshift)(THTensor *r_, THTensor *t, real value) -{ -#if defined(TH_REAL_IS_FLOAT) - return THTensor_(mul)(r_, t, powf(2, value)); -#elif defined(TH_REAL_IS_DOUBLE) - return THTensor_(mul)(r_, t, pow(2, value)); -#elif defined(TH_REAL_IS_HALF) - return THError("lshift is not supported for torch.HalfTensor"); -#else - THTensor_(resizeAs)(r_, t); - if (THTensor_(isContiguous)(r_) && - 
THTensor_(isContiguous)(t) && - THTensor_(nElement)(r_) == THTensor_(nElement)(t)) { - real *tp = THTensor_(data)(t); - real *rp = THTensor_(data)(r_); - long sz = THTensor_(nElement)(t); - long i; - #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD * 100) private(i) - for (i=0; i<sz; i++) { -#if defined(TH_REAL_IS_BYTE) - rp[i] = ((real) tp[i]) << value; -#else - rp[i] = ((unsigned real) tp[i]) << value; -#endif - } - } else { -#if defined(TH_REAL_IS_BYTE) - TH_TENSOR_APPLY2(real, r_, real, t, *r__data = (((real) *t_data) << value);); -#else - TH_TENSOR_APPLY2(real, r_, real, t, *r__data = (((unsigned real) *t_data) << value);); -#endif - } -#endif -} - -void THTensor_(rshift)(THTensor *r_, THTensor *t, real value) -{ -#if defined(TH_REAL_IS_FLOAT) - return THTensor_(div)(r_, t, powf(2, value)); -#elif defined(TH_REAL_IS_DOUBLE) - return THTensor_(div)(r_, t, pow(2, value)); -#elif defined(TH_REAL_IS_HALF) - return THError("rshift is not supported for torch.HalfTensor"); -#else - THTensor_(resizeAs)(r_, t); - if (THTensor_(isContiguous)(r_) && - THTensor_(isContiguous)(t) && - THTensor_(nElement)(r_) == THTensor_(nElement)(t)) { - real *tp = THTensor_(data)(t); - real *rp = THTensor_(data)(r_); - long sz = THTensor_(nElement)(t); - long i; - #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD * 100) private(i) - for (i=0; i<sz; i++) { -#if defined(TH_REAL_IS_BYTE) - rp[i] = ((real) tp[i]) >> value; -#else - rp[i] = ((unsigned real) tp[i]) >> value; -#endif - } - } else { -#if defined(TH_REAL_IS_BYTE) - TH_TENSOR_APPLY2(real, r_, real, t, *r__data = (((real) *t_data) >> value);); -#else - TH_TENSOR_APPLY2(real, r_, real, t, *r__data = (((unsigned real) *t_data) >> value);); -#endif - } -#endif -} - -void THTensor_(fmod)(THTensor *r_, THTensor *t, real value) -{ - THTensor_(resizeAs)(r_, t); - if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(nElement)(r_) == THTensor_(nElement)(t)) { - - real *tp = THTensor_(data)(t); - real 
*rp = THTensor_(data)(r_); - ptrdiff_t sz = THTensor_(nElement)(t); - ptrdiff_t i; - #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i) - for (i=0; i<sz; i++) { -#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) - rp[i] = fmod(tp[i], value); -#else - rp[i] = tp[i] % value; -#endif - } - } else { -#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) - TH_TENSOR_APPLY2(real, r_, real, t, *r__data = fmod(*t_data, value);); -#else - TH_TENSOR_APPLY2(real, r_, real, t, *r__data = (*t_data % value);); -#endif - } -} - -void THTensor_(remainder)(THTensor *r_, THTensor *t, real value) -{ - THTensor_(resizeAs)(r_, t); - if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(nElement)(r_) == THTensor_(nElement)(t)) { - real *tp = THTensor_(data)(t); - real *rp = THTensor_(data)(r_); - ptrdiff_t sz = THTensor_(nElement)(t); - ptrdiff_t i; - #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i) - for (i=0; i<sz; i++) { -#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) - rp[i] = (value == 0)? NAN : tp[i] - value * floor(tp[i] / value); -#else - // There is no NAN for integers - rp[i] = tp[i] % value; - if (rp[i] * value < 0) - rp[i] += value; -#endif - } - } else { -#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) - TH_TENSOR_APPLY2(real, r_, real, t, *r__data = (value == 0)? 
NAN : *t_data - value * floor(*t_data / value);); -#else - // There is no NAN for integers - TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data % value; - if (*r__data * value < 0) *r__data += value;); -#endif - } -} - -void THTensor_(bitand)(THTensor *r_, THTensor *t, real value) -{ -#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF) - return THError("bitand is only supported for integer type tensors"); -#else - THTensor_(resizeAs)(r_, t); - if (THTensor_(isContiguous)(r_) && - THTensor_(isContiguous)(t) && - THTensor_(nElement)(r_) == THTensor_(nElement)(t)) { - real *tp = THTensor_(data)(t); - real *rp = THTensor_(data)(r_); - long sz = THTensor_(nElement)(t); - long i; - #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD * 100) private(i) - for (i=0; i<sz; i++) { - rp[i] = tp[i] & value; - } - } else { - TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data & value;); - } -#endif -} - -void THTensor_(bitor)(THTensor *r_, THTensor *t, real value) -{ -#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF) - return THError("bitor is only supported for integer type tensors"); -#else - THTensor_(resizeAs)(r_, t); - if (THTensor_(isContiguous)(r_) && - THTensor_(isContiguous)(t) && - THTensor_(nElement)(r_) == THTensor_(nElement)(t)) { - real *tp = THTensor_(data)(t); - real *rp = THTensor_(data)(r_); - long sz = THTensor_(nElement)(t); - long i; - #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD * 100) private(i) - for (i=0; i<sz; i++) { - rp[i] = tp[i] | value; - } - } else { - TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data | value;); - } -#endif -} - -void THTensor_(bitxor)(THTensor *r_, THTensor *t, real value) -{ -#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF) - return THError("bitxor is only supported for integer type tensors"); -#else - THTensor_(resizeAs)(r_, t); - if (THTensor_(isContiguous)(r_) && - 
THTensor_(isContiguous)(t) && - THTensor_(nElement)(r_) == THTensor_(nElement)(t)) { - real *tp = THTensor_(data)(t); - real *rp = THTensor_(data)(r_); - long sz = THTensor_(nElement)(t); - long i; - #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD * 100) private(i) - for (i=0; i<sz; i++) { - rp[i] = tp[i] ^ value; - } - } else { - TH_TENSOR_APPLY2(real, r_, real, t, *r__data = *t_data ^ value;); - } -#endif -} - -void THTensor_(clamp)(THTensor *r_, THTensor *t, real min_value, real max_value) -{ - THTensor_(resizeAs)(r_, t); - if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(nElement)(r_) == THTensor_(nElement)(t)) { - real *tp = THTensor_(data)(t); - real *rp = THTensor_(data)(r_); - /* real t_val; */ - ptrdiff_t sz = THTensor_(nElement)(t); - ptrdiff_t i; - #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i) - for (i=0; i<sz; i++) - rp[i] = (tp[i] < min_value) ? min_value : (tp[i] > max_value ? max_value : tp[i]); - } else { - TH_TENSOR_APPLY2(real, r_, real, t, *r__data = (*t_data < min_value) ? min_value : (*t_data > max_value ? 
max_value : *t_data);); - } -} - -void THTensor_(cadd)(THTensor *r_, THTensor *t, real value, THTensor *src) -{ - THTensor_(resizeAs)(r_, t); - if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(isContiguous)(src) && THTensor_(nElement)(r_) == THTensor_(nElement)(src)) { - if(r_ == t) { - THBlas_(axpy)(THTensor_(nElement)(t), value, THTensor_(data)(src), 1, THTensor_(data)(r_), 1); - } else { - TH_TENSOR_APPLY3_CONTIG(real, r_, real, t, real, src, THVector_(cadd)(r__data, t_data, src_data, value, r__len);); - } - } else { - TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data + value * *src_data;); - } -} - -void THTensor_(csub)(THTensor *r_, THTensor *t, real value,THTensor *src) -{ - THTensor_(cadd)(r_, t, -value, src); -} - -void THTensor_(cmul)(THTensor *r_, THTensor *t, THTensor *src) -{ - THTensor_(resizeAs)(r_, t); - if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(isContiguous)(src) && THTensor_(nElement)(r_) == THTensor_(nElement)(src)) { - TH_TENSOR_APPLY3_CONTIG(real, r_, real, t, real, src, THVector_(cmul)(r__data, t_data, src_data, r__len);); - } else { - TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data * *src_data;); - } -} - -void THTensor_(cpow)(THTensor *r_, THTensor *t, THTensor *src) -{ - THTensor_(resizeAs)(r_, t); - if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(isContiguous)(src) && THTensor_(nElement)(r_) == THTensor_(nElement)(src)) { - real *tp = THTensor_(data)(t); - real *sp = THTensor_(data)(src); - real *rp = THTensor_(data)(r_); - ptrdiff_t sz = THTensor_(nElement)(t); - ptrdiff_t i; - #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i) - for (i=0; i<sz; i++) - rp[i] = pow(tp[i], sp[i]); - } else { - TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = pow(*t_data, *src_data);); - } -} - -void THTensor_(cdiv)(THTensor *r_, THTensor *t, THTensor *src) -{ - THTensor_(resizeAs)(r_, t); - if 
(THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(isContiguous)(src) && THTensor_(nElement)(r_) == THTensor_(nElement)(src)) { - TH_TENSOR_APPLY3_CONTIG(real, r_, real, t, real, src, THVector_(cdiv)(r__data, t_data, src_data, r__len);); - } else { - TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data / *src_data;); - } -} - -void THTensor_(clshift)(THTensor *r_, THTensor *t, THTensor *src) -{ -#if defined(TH_REAL_IS_HALF) - return THError("clshift is not supported for torch.HalfTensor"); -#endif - THTensor_(resizeAs)(r_, t); - if (THTensor_(isContiguous)(r_) && - THTensor_(isContiguous)(t) && - THTensor_(isContiguous)(src) && - THTensor_(nElement)(r_) == THTensor_(nElement)(src)) { - real *tp = THTensor_(data)(t); - real *sp = THTensor_(data)(src); - real *rp = THTensor_(data)(r_); - ptrdiff_t sz = THTensor_(nElement)(t); - ptrdiff_t i; - #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i) - for (i=0; i<sz; i++) { -#if defined(TH_REAL_IS_FLOAT) - rp[i] = tp[i] * powf(2, sp[i]); -#elif defined(TH_REAL_IS_DOUBLE) - rp[i] = tp[i] * pow(2, sp[i]); -#elif defined(TH_REAL_IS_BYTE) - rp[i] = ((real) tp[i]) << sp[i]; -#else - rp[i] = ((unsigned real) tp[i]) << sp[i]; -#endif - } - } else { -#if defined(TH_REAL_IS_FLOAT) - TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data * powf(2, *src_data);); -#elif defined(TH_REAL_IS_DOUBLE) - TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data * pow(2, *src_data);); -#elif defined(TH_REAL_IS_BYTE) - TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = ((real)*t_data) << *src_data;); -#else - TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = ((unsigned real)*t_data) << *src_data;); -#endif - } -} - -void THTensor_(crshift)(THTensor *r_, THTensor *t, THTensor *src) -{ -#if defined(TH_REAL_IS_HALF) - return THError("crshift is not supported for torch.HalfTensor"); -#endif - THTensor_(resizeAs)(r_, t); - if (THTensor_(isContiguous)(r_) && - 
THTensor_(isContiguous)(t) && - THTensor_(isContiguous)(src) && - THTensor_(nElement)(r_) == THTensor_(nElement)(src)) { - real *tp = THTensor_(data)(t); - real *sp = THTensor_(data)(src); - real *rp = THTensor_(data)(r_); - ptrdiff_t sz = THTensor_(nElement)(t); - ptrdiff_t i; - #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i) - for (i=0; i<sz; i++) { -#if defined(TH_REAL_IS_FLOAT) - rp[i] = tp[i] / powf(2, sp[i]); -#elif defined(TH_REAL_IS_DOUBLE) - rp[i] = tp[i] / pow(2, sp[i]); -#elif defined(TH_REAL_IS_BYTE) - rp[i] = ((real) tp[i]) >> sp[i]; -#else - rp[i] = ((unsigned real) tp[i]) >> sp[i]; -#endif - } - } else { -#if defined(TH_REAL_IS_FLOAT) - TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data / powf(2, *src_data);); -#elif defined(TH_REAL_IS_DOUBLE) - TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data / pow(2, *src_data);); -#elif defined(TH_REAL_IS_BYTE) - TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = ((real)*t_data) >> *src_data;); -#else - TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = ((unsigned real)*t_data) >> *src_data;); -#endif - } -} - -void THTensor_(cfmod)(THTensor *r_, THTensor *t, THTensor *src) -{ - THTensor_(resizeAs)(r_, t); - if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(isContiguous)(src) && THTensor_(nElement)(r_) == THTensor_(nElement)(src)) { - real *tp = THTensor_(data)(t); - real *sp = THTensor_(data)(src); - real *rp = THTensor_(data)(r_); - ptrdiff_t sz = THTensor_(nElement)(t); - ptrdiff_t i; - #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i) - for (i=0; i<sz; i++) { -#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) - rp[i] = fmod(tp[i], sp[i]); -#else - rp[i] = tp[i] % sp[i]; -#endif - } - } else { -#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) - TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = fmod(*t_data, *src_data);); -#else - TH_TENSOR_APPLY3(real, r_, real, t, 
real, src, *r__data = (*t_data % *src_data);); -#endif - - } -} - -void THTensor_(cremainder)(THTensor *r_, THTensor *t, THTensor *src) -{ - THTensor_(resizeAs)(r_, t); - if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(isContiguous)(src) && THTensor_(nElement)(r_) == THTensor_(nElement)(src)) { - real *tp = THTensor_(data)(t); - real *sp = THTensor_(data)(src); - real *rp = THTensor_(data)(r_); - ptrdiff_t sz = THTensor_(nElement)(t); - ptrdiff_t i; - #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i) - for (i=0; i<sz; i++) { -#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) - rp[i] = (sp[i] == 0)? NAN : tp[i] - sp[i] * floor(tp[i] / sp[i]); -#else - // There is no NAN for integers - rp[i] = tp[i] % sp[i]; - if (rp[i] * sp[i] < 0) - rp[i] += sp[i]; -#endif - } - } else { -#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) - TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = (*src_data == 0)? NAN : *t_data - *src_data * floor(*t_data / *src_data);); -#else - // There is no NAN for integers - TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data % *src_data; - if (*r__data * *src_data < 0) *r__data += *src_data;); -#endif - - } -} - -void THTensor_(cbitand)(THTensor *r_, THTensor *t, THTensor *src) -{ -#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF) - return THError("cbitand is only supported for integer type tensors"); -#else - THTensor_(resizeAs)(r_, t); - if (THTensor_(isContiguous)(r_) && - THTensor_(isContiguous)(t) && - THTensor_(isContiguous)(src) && - THTensor_(nElement)(r_) == THTensor_(nElement)(src)) { - real *tp = THTensor_(data)(t); - real *sp = THTensor_(data)(src); - real *rp = THTensor_(data)(r_); - ptrdiff_t sz = THTensor_(nElement)(t); - ptrdiff_t i; - #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i) - for (i=0; i<sz; i++) { - rp[i] = tp[i] & sp[i]; - } - } else { - TH_TENSOR_APPLY3(real, r_, real, t, 
real, src, *r__data = *t_data & *src_data;); - } -#endif -} - -void THTensor_(cbitor)(THTensor *r_, THTensor *t, THTensor *src) -{ -#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF) - return THError("cbitor is only supported for integer type tensors"); -#else - THTensor_(resizeAs)(r_, t); - if (THTensor_(isContiguous)(r_) && - THTensor_(isContiguous)(t) && - THTensor_(isContiguous)(src) && - THTensor_(nElement)(r_) == THTensor_(nElement)(src)) { - real *tp = THTensor_(data)(t); - real *sp = THTensor_(data)(src); - real *rp = THTensor_(data)(r_); - ptrdiff_t sz = THTensor_(nElement)(t); - ptrdiff_t i; - #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i) - for (i=0; i<sz; i++) { - rp[i] = tp[i] | sp[i]; - } - } else { - TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data | *src_data;); - } -#endif -} - -void THTensor_(cbitxor)(THTensor *r_, THTensor *t, THTensor *src) -{ -#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF) - return THError("cbitxor is only supported for integer type tensors"); -#else - THTensor_(resizeAs)(r_, t); - if (THTensor_(isContiguous)(r_) && - THTensor_(isContiguous)(t) && - THTensor_(isContiguous)(src) && - THTensor_(nElement)(r_) == THTensor_(nElement)(src)) { - real *tp = THTensor_(data)(t); - real *sp = THTensor_(data)(src); - real *rp = THTensor_(data)(r_); - ptrdiff_t sz = THTensor_(nElement)(t); - ptrdiff_t i; - #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i) - for (i=0; i<sz; i++) { - rp[i] = tp[i] ^ sp[i]; - } - } else { - TH_TENSOR_APPLY3(real, r_, real, t, real, src, *r__data = *t_data ^ *src_data;); - } -#endif -} - -void THTensor_(tpow)(THTensor *r_, real value, THTensor *t) -{ - THTensor_(resizeAs)(r_, t); - if (THTensor_(isContiguous)(r_) && THTensor_(isContiguous)(t) && THTensor_(nElement)(r_) == THTensor_(nElement)(t)) { - real *tp = THTensor_(data)(t); - real *rp = THTensor_(data)(r_); - 
ptrdiff_t sz = THTensor_(nElement)(t); - ptrdiff_t i; - #pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i) - for (i=0; i<sz; i++) - rp[i] = pow(value, tp[i]); - } else { - TH_TENSOR_APPLY2(real, r_, real, t, *r__data = pow(value, *t_data);); - } -} - -void THTensor_(addcmul)(THTensor *r_, THTensor *t, real value, THTensor *src1, THTensor *src2) -{ - if(r_ != t) - { - THTensor_(resizeAs)(r_, t); - THTensor_(copy)(r_, t); - } - - TH_TENSOR_APPLY3(real, r_, real, src1, real, src2, *r__data += value * *src1_data * *src2_data;); -} - - -void THTensor_(addcdiv)(THTensor *r_, THTensor *t, real value, THTensor *src1, THTensor *src2) -{ - if(r_ != t) - { - THTensor_(resizeAs)(r_, t); - THTensor_(copy)(r_, t); - } - - TH_TENSOR_APPLY3(real, r_, real, src1, real, src2, *r__data += value * *src1_data / *src2_data;); -} - -void THTensor_(addmv)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor *mat, THTensor *vec) -{ - if( (mat->nDimension != 2) || (vec->nDimension != 1) ) - THError("matrix and vector expected, got %dD, %dD", - mat->nDimension, vec->nDimension); - - if( mat->size[1] != vec->size[0] ) { - THDescBuff bm = THTensor_(sizeDesc)(mat); - THDescBuff bv = THTensor_(sizeDesc)(vec); - THError("size mismatch, %s, %s", bm.str, bv.str); - } - - if(t->nDimension != 1) - THError("vector expected, got t: %dD", t->nDimension); - - if(t->size[0] != mat->size[0]) { - THDescBuff bt = THTensor_(sizeDesc)(t); - THDescBuff bm = THTensor_(sizeDesc)(mat); - THError("size mismatch, t: %s, mat: %s", bt.str, bm.str); - } - - if(r_ != t) - { - THTensor_(resizeAs)(r_, t); - THTensor_(copy)(r_, t); - } - - if(mat->stride[0] == 1) - { - THBlas_(gemv)('n', mat->size[0], mat->size[1], - alpha, THTensor_(data)(mat), mat->stride[1], - THTensor_(data)(vec), vec->stride[0], - beta, THTensor_(data)(r_), r_->stride[0]); - } - else if(mat->stride[1] == 1) - { - THBlas_(gemv)('t', mat->size[1], mat->size[0], - alpha, THTensor_(data)(mat), mat->stride[0], - 
THTensor_(data)(vec), vec->stride[0], - beta, THTensor_(data)(r_), r_->stride[0]); - } - else - { - THTensor *cmat = THTensor_(newContiguous)(mat); - - THBlas_(gemv)('t', mat->size[1], mat->size[0], - alpha, THTensor_(data)(cmat), cmat->stride[0], - THTensor_(data)(vec), vec->stride[0], - beta, THTensor_(data)(r_), r_->stride[0]); - - THTensor_(free)(cmat); - } -} - -void THTensor_(match)(THTensor *r_, THTensor *m1, THTensor *m2, real gain) -{ - long N1 = m1->size[0]; - long N2 = m2->size[0]; - long dim; - real *m1_p; - real *m2_p; - real *r_p; - long i; - - THTensor_(resize2d)(r_, N1, N2); - - m1 = THTensor_(newContiguous)(m1); - m2 = THTensor_(newContiguous)(m2); - - THTensor_(resize2d)(m1, N1, THTensor_(nElement)(m1) / N1); - THTensor_(resize2d)(m2, N2, THTensor_(nElement)(m2) / N2); - - dim = m1->size[1]; - THArgCheck(m1->size[1] == m2->size[1], 3, "m1 and m2 must have the same inner vector dim"); - - m1_p = THTensor_(data)(m1); - m2_p = THTensor_(data)(m2); - r_p = THTensor_(data)(r_); - -#pragma omp parallel for private(i) - for (i=0; i<N1; i++) { - long j,k; - for (j=0; j<N2; j++) { - real sum = 0; - for (k=0; k<dim; k++) { - real term = m1_p[ i*dim + k ] - m2_p[ j*dim + k ]; - sum += term*term; - } - r_p[ i*N2 + j ] = gain * sum; - } - } - - THTensor_(free)(m1); - THTensor_(free)(m2); -} - -void THTensor_(addmm)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor *m1, THTensor *m2) -{ - char transpose_r, transpose_m1, transpose_m2; - THTensor *r__, *m1_, *m2_; - - if( (m1->nDimension != 2) || (m2->nDimension != 2)) - THError("matrices expected, got %dD, %dD tensors", m1->nDimension, m2->nDimension); - - if(m1->size[1] != m2->size[0]) { - THDescBuff bm1 = THTensor_(sizeDesc)(m1); - THDescBuff bm2 = THTensor_(sizeDesc)(m2); - THError("size mismatch, m1: %s, m2: %s", bm1.str, bm2.str); - } - - if( t->nDimension != 2 ) - THError("matrix expected, got %dD tensor for t", t->nDimension); - - if( (t->size[0] != m1->size[0]) || (t->size[1] != m2->size[1]) ) { 
- THDescBuff bt = THTensor_(sizeDesc)(t); - THDescBuff bm1 = THTensor_(sizeDesc)(m1); - THDescBuff bm2 = THTensor_(sizeDesc)(m2); - THError("size mismatch, t: %s, m1: %s, m2: %s", bt.str, bm1.str, bm2.str); - } - - if(t != r_) - { - THTensor_(resizeAs)(r_, t); - THTensor_(copy)(r_, t); - } - - /* r_ */ - if(r_->stride[0] == 1 && - r_->stride[1] != 0) - { - transpose_r = 'n'; - r__ = r_; - } - else if(r_->stride[1] == 1 && - r_->stride[0] != 0) - { - THTensor *swap = m2; - m2 = m1; - m1 = swap; - transpose_r = 't'; - r__ = r_; - } - else - { - transpose_r = 'n'; - - THTensor *transp_r_ = THTensor_(newTranspose)(r_, 0, 1); - r__ = THTensor_(newClone)(transp_r_); - THTensor_(free)(transp_r_); - THTensor_(transpose)(r__, NULL, 0, 1); - } - - /* m1 */ - if(m1->stride[(transpose_r == 'n' ? 0 : 1)] == 1 && - m1->stride[(transpose_r == 'n' ? 1 : 0)] != 0) - { - transpose_m1 = 'n'; - m1_ = m1; - } - else if(m1->stride[(transpose_r == 'n' ? 1 : 0)] == 1 && - m1->stride[(transpose_r == 'n' ? 0 : 1)] != 0) - { - transpose_m1 = 't'; - m1_ = m1; - } - else - { - transpose_m1 = (transpose_r == 'n' ? 't' : 'n'); - m1_ = THTensor_(newContiguous)(m1); - } - - /* m2 */ - if(m2->stride[(transpose_r == 'n' ? 0 : 1)] == 1 && - m2->stride[(transpose_r == 'n' ? 1 : 0)] != 0) - { - transpose_m2 = 'n'; - m2_ = m2; - } - else if(m2->stride[(transpose_r == 'n' ? 1 : 0)] == 1 && - m2->stride[(transpose_r == 'n' ? 0 : 1)] != 0) - { - transpose_m2 = 't'; - m2_ = m2; - } - else - { - transpose_m2 = (transpose_r == 'n' ? 't' : 'n'); - m2_ = THTensor_(newContiguous)(m2); - } - -#pragma omp critical(blasgemm) - /* do the operation */ - THBlas_(gemm)(transpose_m1, - transpose_m2, - r__->size[(transpose_r == 'n' ? 0 : 1)], - r__->size[(transpose_r == 'n' ? 1 : 0)], - m1_->size[(transpose_r == 'n' ? 1 : 0)], - alpha, - THTensor_(data)(m1_), - (transpose_m1 == 'n' ? m1_->stride[(transpose_r == 'n' ? 1 : 0)] : m1_->stride[(transpose_r == 'n' ? 0 : 1)]), - THTensor_(data)(m2_), - (transpose_m2 == 'n' ? 
m2_->stride[(transpose_r == 'n' ? 1 : 0)] : m2_->stride[(transpose_r == 'n' ? 0 : 1)]), - beta, - THTensor_(data)(r__), - r__->stride[(transpose_r == 'n' ? 1 : 0)]); - - /* free intermediate variables */ - if(m1_ != m1) - THTensor_(free)(m1_); - - if(m2_ != m2) - THTensor_(free)(m2_); - - if(r__ != r_) - THTensor_(freeCopyTo)(r__, r_); -} - -void THTensor_(addr)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor *vec1, THTensor *vec2) -{ - if( (vec1->nDimension != 1) || (vec2->nDimension != 1) ) - THError("vector and vector expected, got %dD, %dD tensors", - vec1->nDimension, vec2->nDimension); - - if(t->nDimension != 2) - THError("expected matrix, got %dD tensor for t", t->nDimension); - - if( (t->size[0] != vec1->size[0]) || (t->size[1] != vec2->size[0]) ) { - THDescBuff bt = THTensor_(sizeDesc)(t); - THDescBuff bv1 = THTensor_(sizeDesc)(vec1); - THDescBuff bv2 = THTensor_(sizeDesc)(vec2); - THError("size mismatch, t: %s, vec1: %s, vec2: %s", bt.str, bv1.str, bv2.str); - } - - if(r_ != t) - { - THTensor_(resizeAs)(r_, t); - THTensor_(copy)(r_, t); - } - - if(beta == 0) { - THTensor_(zero)(r_); - } - else if(beta != 1) - THTensor_(mul)(r_, r_, beta); - - if(r_->stride[0] == 1) - { - THBlas_(ger)(vec1->size[0], vec2->size[0], - alpha, THTensor_(data)(vec1), vec1->stride[0], - THTensor_(data)(vec2), vec2->stride[0], - THTensor_(data)(r_), r_->stride[1]); - } - else if(r_->stride[1] == 1) - { - THBlas_(ger)(vec2->size[0], vec1->size[0], - alpha, THTensor_(data)(vec2), vec2->stride[0], - THTensor_(data)(vec1), vec1->stride[0], - THTensor_(data)(r_), r_->stride[0]); - } - else - { - THTensor *cr = THTensor_(newClone)(r_); - - THBlas_(ger)(vec2->size[0], vec1->size[0], - alpha, THTensor_(data)(vec2), vec2->stride[0], - THTensor_(data)(vec1), vec1->stride[0], - THTensor_(data)(cr), cr->stride[0]); - - THTensor_(freeCopyTo)(cr, r_); - } -} - -void THTensor_(addbmm)(THTensor *result, real beta, THTensor *t, real alpha, THTensor *batch1, THTensor *batch2) -{ - long 
batch; - - THArgCheck(THTensor_(nDimension)(batch1) == 3, 1, "expected 3D tensor"); - THArgCheck(THTensor_(nDimension)(batch2) == 3, 2, "expected 3D tensor"); - THArgCheck(THTensor_(size)(batch1, 0) == THTensor_(size)(batch2, 0), 2, - "equal number of batches expected, got %d, %d", - THTensor_(size)(batch1, 0), THTensor_(size)(batch2, 0)); - THArgCheck(THTensor_(size)(batch1, 2) == THTensor_(size)(batch2, 1), 2, - "wrong matrix size, batch1: %dx%d, batch2: %dx%d", - THTensor_(size)(batch1, 1), THTensor_(size)(batch1,2), - THTensor_(size)(batch2, 1), THTensor_(size)(batch2,2)); - - long dim1 = THTensor_(size)(batch1, 1); - long dim2 = THTensor_(size)(batch2, 2); - THArgCheck(THTensor_(size)(t, 0) == dim1, 1, "output tensor of incorrect size"); - THArgCheck(THTensor_(size)(t, 1) == dim2, 1, "output tensor of incorrect size"); - - if (t != result) { - THTensor_(resizeAs)(result, t); - THTensor_(copy)(result, t); - } - - THTensor *matrix1 = THTensor_(new)(); - THTensor *matrix2 = THTensor_(new)(); - - for (batch = 0; batch < THTensor_(size)(batch1, 0); ++batch) { - THTensor_(select)(matrix1, batch1, 0, batch); - THTensor_(select)(matrix2, batch2, 0, batch); - - THTensor_(addmm)(result, beta, result, alpha, matrix1, matrix2); - beta = 1; // accumulate output once - } - - THTensor_(free)(matrix1); - THTensor_(free)(matrix2); -} - -void THTensor_(baddbmm)(THTensor *result, real beta, THTensor *t, real alpha, THTensor *batch1, THTensor *batch2) -{ - long batch; - - THArgCheck(THTensor_(nDimension)(batch1) == 3, 1, "expected 3D tensor, got %dD", THTensor_(nDimension)(batch1)); - THArgCheck(THTensor_(nDimension)(batch2) == 3, 2, "expected 3D tensor, got %dD", THTensor_(nDimension)(batch2)); - THArgCheck(THTensor_(size)(batch1, 0) == THTensor_(size)(batch2, 0), 2, - "equal number of batches expected, got %d, %d", - THTensor_(size)(batch1, 0), THTensor_(size)(batch2, 0)); - THArgCheck(THTensor_(size)(batch1, 2) == THTensor_(size)(batch2, 1), 2, - "wrong matrix size, batch1: 
%dx%d, batch2: %dx%d", - THTensor_(size)(batch1, 1), THTensor_(size)(batch1, 2), - THTensor_(size)(batch2, 1), THTensor_(size)(batch2, 2)); - - long bs = THTensor_(size)(batch1, 0); - long dim1 = THTensor_(size)(batch1, 1); - long dim2 = THTensor_(size)(batch2, 2); - THArgCheck(THTensor_(size)(t, 0) == bs, 1, "output tensor of incorrect size"); - THArgCheck(THTensor_(size)(t, 1) == dim1, 1, "output tensor of incorrect size"); - THArgCheck(THTensor_(size)(t, 2) == dim2, 1, "output tensor of incorrect size"); - - if (t != result) { - THTensor_(resizeAs)(result, t); - THTensor_(copy)(result, t); - } - - THTensor *matrix1 = THTensor_(new)(); - THTensor *matrix2 = THTensor_(new)(); - THTensor *result_matrix = THTensor_(new)(); - - for (batch = 0; batch < THTensor_(size)(batch1, 0); ++batch) { - THTensor_(select)(matrix1, batch1, 0, batch); - THTensor_(select)(matrix2, batch2, 0, batch); - THTensor_(select)(result_matrix, result, 0, batch); - - THTensor_(addmm)(result_matrix, beta, result_matrix, alpha, matrix1, matrix2); - } - - THTensor_(free)(matrix1); - THTensor_(free)(matrix2); - THTensor_(free)(result_matrix); -} - -ptrdiff_t THTensor_(numel)(THTensor *t) -{ - return THTensor_(nElement)(t); -} - -void THTensor_(max)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim) -{ - THLongStorage *dim; - - THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 2, "dimension %d out of range", - dimension + TH_INDEX_BASE); - - dim = THTensor_(newSizeOf)(t); - THLongStorage_set(dim, dimension, 1); - THTensor_(resize)(values_, dim, NULL); - THLongTensor_resize(indices_, dim, NULL); - THLongStorage_free(dim); - - // two implementations optimized for data locality - if (t->stride[dimension] == 1) { - real theMax; - real value; - long theIndex; - long i; - TH_TENSOR_DIM_APPLY3(real, t, real, values_, long, indices_, dimension, - theMax = t_data[0]; - theIndex = 0; - - for(i = 0; i < t_size; i++) - { - value = t_data[i*t_stride]; - /* This 
is not the same as value>theMax in the case of NaNs */ - if(!(value <= theMax)) - { - theIndex = i; - theMax = value; - th_isnan_break(value) - } - } - *indices__data = theIndex; - *values__data = theMax;); - } else { - if (THTensor_(nDimension)(t) > 1) { - THTensor *t0 = THTensor_(newSelect)(t, dimension, 0); - THTensor_(copy)(values_, t0); - THTensor_(free)(t0); - } else { - THTensor_(fill)(values_, THTensor_(get1d)(t, 0)); - } - THLongTensor_zero(indices_); - - if(t->size[dimension] == 1) { - return; - } - - THTensor *tempValues_ = THTensor_(newWithTensor)(values_); - // tempValues_.expand_as(t) - tempValues_->size[dimension] = t->size[dimension]; - tempValues_->stride[dimension] = 0; - - THLongTensor *tempIndices_ = THLongTensor_newWithTensor(indices_); - // tempIndices_.expand_as(t) - tempIndices_->size[dimension] = t->size[dimension]; - tempIndices_->stride[dimension] = 0; - - TH_TENSOR_APPLY3_D(real, t, real, tempValues_, long, tempIndices_, dimension, - if(!(*t_data <= *tempValues__data) && !th_isnan(*tempValues__data)) { - *tempValues__data = *t_data; - *tempIndices__data = *tempIndices__dimOffset; - }); - - THTensor_(free)(tempValues_); - THLongTensor_free(tempIndices_); - } - - if (!keepdim) { - THTensor_(squeeze1d)(values_, values_, dimension); - THLongTensor_squeeze1d(indices_, indices_, dimension); - } -} - -void THTensor_(min)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim) -{ - THLongStorage *dim; - - THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 2, "dimension %d out of range", - dimension + TH_INDEX_BASE); - - dim = THTensor_(newSizeOf)(t); - THLongStorage_set(dim, dimension, 1); - THTensor_(resize)(values_, dim, NULL); - THLongTensor_resize(indices_, dim, NULL); - THLongStorage_free(dim); - - // two implementations optimized for data locality - if (t->stride[dimension] == 1) { - real theMax; - real value; - long theIndex; - long i; - TH_TENSOR_DIM_APPLY3(real, t, real, values_, long, 
indices_, dimension, - theMax = t_data[0]; - theIndex = 0; - - for(i = 0; i < t_size; i++) - { - value = t_data[i*t_stride]; - /* This is not the same as value>theMax in the case of NaNs */ - if(!(value >= theMax)) - { - theIndex = i; - theMax = value; - th_isnan_break(value) - } - } - *indices__data = theIndex; - *values__data = theMax;); - } else { - if (THTensor_(nDimension)(t) > 1) { - THTensor *t0 = THTensor_(newSelect)(t, dimension, 0); - THTensor_(copy)(values_, t0); - THTensor_(free)(t0); - } else { - THTensor_(fill)(values_, THTensor_(get1d)(t, 0)); - } - THLongTensor_zero(indices_); - - if(t->size[dimension] == 1) { - return; - } - - THTensor *tempValues_ = THTensor_(newWithTensor)(values_); - // tempValues_.expand_as(t) - tempValues_->size[dimension] = t->size[dimension]; - tempValues_->stride[dimension] = 0; - - THLongTensor *tempIndices_ = THLongTensor_newWithTensor(indices_); - // tempIndices_.expand_as(t) - tempIndices_->size[dimension] = t->size[dimension]; - tempIndices_->stride[dimension] = 0; - - TH_TENSOR_APPLY3_D(real, t, real, tempValues_, long, tempIndices_, dimension, - if(!(*t_data >= *tempValues__data) && !th_isnan(*tempValues__data)) { - *tempValues__data = *t_data; - *tempIndices__data = *tempIndices__dimOffset; - }); - } - - if (!keepdim) { - THTensor_(squeeze1d)(values_, values_, dimension); - THLongTensor_squeeze1d(indices_, indices_, dimension); - } -} - - -void THTensor_(sum)(THTensor *r_, THTensor *t, int dimension, int keepdim) -{ - THLongStorage *dim; - - THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 2, "dimension %d out of range", - dimension + TH_INDEX_BASE); - - dim = THTensor_(newSizeOf)(t); - THLongStorage_set(dim, dimension, 1); - THTensor_(resize)(r_, dim, NULL); - THLongStorage_free(dim); - - // two implementations optimized for data locality - if (t->stride[dimension] == 1) { - TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, - accreal sum = 0; - long i; - for(i = 0; i < t_size; i++) - sum += 
t_data[i*t_stride]; - *r__data = (real)sum;); - } else { - THTensor_(zero)(r_); - THTensor *temp_ = THTensor_(newWithTensor)(r_); - // r_.expand_as(t) - temp_->size[dimension] = t->size[dimension]; - temp_->stride[dimension] = 0; - - TH_TENSOR_APPLY2(real, temp_, real, t, *temp__data = *temp__data + *t_data;); - THTensor_(free)(temp_); - } - - if (!keepdim) { - THTensor_(squeeze1d)(r_, r_, dimension); - } -} - -void THTensor_(prod)(THTensor *r_, THTensor *t, int dimension, int keepdim) -{ - THLongStorage *dim; - - THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 2, "dimension %d out of range", - dimension + TH_INDEX_BASE); - - dim = THTensor_(newSizeOf)(t); - THLongStorage_set(dim, dimension, 1); - THTensor_(resize)(r_, dim, NULL); - THLongStorage_free(dim); - - // two implementations optimized for data locality - if (t->stride[dimension] == 1) { - TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, - accreal prod = 1; - long i; - for(i = 0; i < t_size; i++) - prod *= t_data[i*t_stride]; - *r__data = (real)prod;); - } else { - THTensor_(fill)(r_, 1); - THTensor *temp_ = THTensor_(newWithTensor)(r_); - // r_.expand_as(t) - temp_->size[dimension] = t->size[dimension]; - temp_->stride[dimension] = 0; - - TH_TENSOR_APPLY2(real, temp_, real, t, *temp__data = *temp__data * *t_data;); - THTensor_(free)(temp_); - } - - if (!keepdim) { - THTensor_(squeeze1d)(r_, r_, dimension); - } -} - -void THTensor_(cumsum)(THTensor *r_, THTensor *t, int dimension) -{ - THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 2, "dimension %d out of range", - dimension + TH_INDEX_BASE); - - THTensor_(resizeAs)(r_, t); - - TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, - accreal cumsum = 0; - long i; - for(i = 0; i < t_size; i++) - { - cumsum += t_data[i*t_stride]; - r__data[i*r__stride] = (real)cumsum; - }); -} - -void THTensor_(cumprod)(THTensor *r_, THTensor *t, int dimension) -{ - THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 2, 
"dimension %d out of range", - dimension + TH_INDEX_BASE); - - THTensor_(resizeAs)(r_, t); - - TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, - accreal cumprod = 1; - long i; - for(i = 0; i < t_size; i++) - { - cumprod *= t_data[i*t_stride]; - r__data[i*r__stride] = (real)cumprod; - }); -} - - -void THTensor_(sign)(THTensor *r_, THTensor *t) -{ - THTensor_(resizeAs)(r_, t); - -#if defined (TH_REAL_IS_BYTE) - TH_TENSOR_APPLY2(real, r_, real, t, - if (*t_data > 0) *r__data = 1; - else *r__data = 0;); -#else - TH_TENSOR_APPLY2(real, r_, real, t, - if (*t_data > 0) *r__data = 1; - else if (*t_data < 0) *r__data = -1; - else *r__data = 0;); -#endif -} - - -accreal THTensor_(trace)(THTensor *t) -{ - real *t_data = THTensor_(data)(t); - accreal sum = 0; - long i = 0; - long t_stride_0, t_stride_1, t_diag_size; - - THArgCheck(THTensor_(nDimension)(t) == 2, 1, "expected a matrix"); - - t_stride_0 = THTensor_(stride)(t, 0); - t_stride_1 = THTensor_(stride)(t, 1); - t_diag_size = THMin(THTensor_(size)(t, 0), THTensor_(size)(t, 1)); - while(i < t_diag_size) - { - sum += t_data[i*(t_stride_0+t_stride_1)]; - i++; - } - - return sum; -} - -void THTensor_(cross)(THTensor *r_, THTensor *a, THTensor *b, int dimension) -{ - int i; - - if(THTensor_(nDimension)(a) != THTensor_(nDimension)(b)) - THError("inconsistent tensor dimension %dD, %dD", - THTensor_(nDimension)(a), THTensor_(nDimension)(b)); - - for(i = 0; i < THTensor_(nDimension)(a); i++) - { - if(THTensor_(size)(a, i) != THTensor_(size)(b, i)) { - THDescBuff ba = THTensor_(sizeDesc)(a); - THDescBuff bb = THTensor_(sizeDesc)(b); - THError("inconsistent tensor sizes %s, %s", ba.str, bb.str); - } - } - - if(dimension < 0) - { - for(i = 0; i < THTensor_(nDimension)(a); i++) - { - if(THTensor_(size)(a, i) == 3) - { - dimension = i; - break; - } - } - if(dimension < 0) { - THDescBuff ba = THTensor_(sizeDesc)(a); - THError("no dimension of size 3 in a: %s", ba.str); - } - } - - THArgCheck(dimension >= 0 && dimension < 
THTensor_(nDimension)(a), 3, "dimension %d out of range", - dimension + TH_INDEX_BASE); - THArgCheck(THTensor_(size)(a, dimension) == 3, 3, "dimension %d does not have size 3", - dimension + TH_INDEX_BASE); - - THTensor_(resizeAs)(r_, a); - - TH_TENSOR_DIM_APPLY3(real, a, real, b, real, r_, dimension, - r__data[0*r__stride] = a_data[1*a_stride]*b_data[2*b_stride] - a_data[2*a_stride]*b_data[1*b_stride]; - r__data[1*r__stride] = a_data[2*a_stride]*b_data[0*b_stride] - a_data[0*a_stride]*b_data[2*b_stride]; - r__data[2*r__stride] = a_data[0*a_stride]*b_data[1*b_stride] - a_data[1*a_stride]*b_data[0*b_stride];); -} - -void THTensor_(cmax)(THTensor *r, THTensor *t, THTensor *src) { - THTensor_(resizeAs)(r, t); - TH_TENSOR_APPLY3(real, r, real, t, real, src, - *r_data = *t_data > *src_data ? *t_data : *src_data;); -} - -void THTensor_(cmin)(THTensor *r, THTensor *t, THTensor *src) { - THTensor_(resizeAs)(r, t); - TH_TENSOR_APPLY3(real, r, real, t, real, src, - *r_data = *t_data < *src_data ? *t_data : *src_data;); -} - -void THTensor_(cmaxValue)(THTensor *r, THTensor *t, real value) { - THTensor_(resizeAs)(r, t); - TH_TENSOR_APPLY2(real, r, real, t, - *r_data = *t_data > value ? *t_data : value;); -} - -void THTensor_(cminValue)(THTensor *r, THTensor *t, real value) { - THTensor_(resizeAs)(r, t); - TH_TENSOR_APPLY2(real, r, real, t, - *r_data = *t_data < value ? 
*t_data : value;); -} - -void THTensor_(zeros)(THTensor *r_, THLongStorage *size) -{ - THTensor_(resize)(r_, size, NULL); - THTensor_(zero)(r_); -} - -void THTensor_(ones)(THTensor *r_, THLongStorage *size) -{ - THTensor_(resize)(r_, size, NULL); - THTensor_(fill)(r_, 1); -} - -void THTensor_(diag)(THTensor *r_, THTensor *t, int k) -{ - THArgCheck(THTensor_(nDimension)(t) == 1 || THTensor_(nDimension)(t) == 2, 1, "matrix or a vector expected"); - - if(THTensor_(nDimension)(t) == 1) - { - real *t_data = THTensor_(data)(t); - long t_stride_0 = THTensor_(stride)(t, 0); - long t_size = THTensor_(size)(t, 0); - long sz = t_size + (k >= 0 ? k : -k); - real *r__data; - long r__stride_0; - long r__stride_1; - long i; - - THTensor_(resize2d)(r_, sz, sz); - THTensor_(zero)(r_); - r__data = THTensor_(data)(r_); - r__stride_0 = THTensor_(stride)(r_, 0); - r__stride_1 = THTensor_(stride)(r_, 1); - r__data += (k >= 0 ? k*r__stride_1 : -k*r__stride_0); - - for(i = 0; i < t_size; i++) - r__data[i*(r__stride_0+r__stride_1)] = t_data[i*t_stride_0]; - } - else - { - real *t_data = THTensor_(data)(t); - long t_stride_0 = THTensor_(stride)(t, 0); - long t_stride_1 = THTensor_(stride)(t, 1); - long sz; - real *r__data; - long r__stride_0; - long i; - - if(k >= 0) - sz = THMin(THTensor_(size)(t, 0), THTensor_(size)(t, 1)-k); - else - sz = THMin(THTensor_(size)(t, 0)+k, THTensor_(size)(t, 1)); - THTensor_(resize1d)(r_, sz); - r__data = THTensor_(data)(r_); - r__stride_0 = THTensor_(stride)(r_, 0); - - t_data += (k >= 0 ? 
k*t_stride_1 : -k*t_stride_0); - for(i = 0; i < sz; i++) - r__data[i*r__stride_0] = t_data[i*(t_stride_0+t_stride_1)]; - } -} - -void THTensor_(eye)(THTensor *r_, long n, long m) -{ - real *r__data; - long i, sz; - - THArgCheck(n > 0, 1, "invalid argument"); - - if(m <= 0) - m = n; - - THTensor_(resize2d)(r_, n, m); - THTensor_(zero)(r_); - - i = 0; - r__data = THTensor_(data)(r_); - sz = THMin(THTensor_(size)(r_, 0), THTensor_(size)(r_, 1)); - for(i = 0; i < sz; i++) - r__data[i*(r_->stride[0]+r_->stride[1])] = 1; -} - - -void THTensor_(range)(THTensor *r_, accreal xmin, accreal xmax, accreal step) -{ - ptrdiff_t size; - real i = 0; - - THArgCheck(step > 0 || step < 0, 3, "step must be a non-null number"); - THArgCheck(((step > 0) && (xmax >= xmin)) || ((step < 0) && (xmax <= xmin)) - , 2, "upper bound and larger bound incoherent with step sign"); - - size = (ptrdiff_t) (((xmax - xmin) / step) + 1); - - if (THTensor_(nElement)(r_) != size) { - THTensor_(resize1d)(r_, size); - } - - TH_TENSOR_APPLY(real, r_, *r__data = xmin + (i++)*step;); -} - -void THTensor_(arange)(THTensor *r_, accreal xmin, accreal xmax, accreal step) { -#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) - int m = fmod(xmax - xmin,step) == 0; -#else - int m = (xmax - xmin) % step == 0; -#endif - if (m) - xmax -= step; - THTensor_(range)(r_,xmin,xmax,step); -} - -void THTensor_(randperm)(THTensor *r_, THGenerator *_generator, long n) -{ - real *r__data; - long r__stride_0; - long i; - - THArgCheck(n > 0, 1, "must be strictly positive"); - - THTensor_(resize1d)(r_, n); - r__data = THTensor_(data)(r_); - r__stride_0 = THTensor_(stride)(r_,0); - - for(i = 0; i < n; i++) - r__data[i*r__stride_0] = (real)(i); - - for(i = 0; i < n-1; i++) - { - long z = THRandom_random(_generator) % (n-i); - real sav = r__data[i*r__stride_0]; - r__data[i*r__stride_0] = r__data[(z+i)*r__stride_0]; - r__data[(z+i)*r__stride_0] = sav; - } -} - -void THTensor_(reshape)(THTensor *r_, THTensor *t, THLongStorage 
*size) -{ - THTensor_(resize)(r_, size, NULL); - THTensor_(copy)(r_, t); -} - -/* I cut and pasted (slightly adapted) the quicksort code from - Sedgewick's 1978 "Implementing Quicksort Programs" article - http://www.csie.ntu.edu.tw/~b93076/p847-sedgewick.pdf - - It is the state of the art existing implementation. The macros - are here to make as close a match as possible to the pseudocode of - Program 2 p.851 - - Note that other partition schemes exist, and are typically presented - in textbook, but those are less efficient. See e.g. - http://cs.stackexchange.com/questions/11458/quicksort-partitioning-hoare-vs-lomuto - - Julien, November 12th 2013 -*/ -#define MAX_LEVELS 300 -#define M_SMALL 10 /* Limit for small subfiles */ - -#define ARR(III) arr[(III)*stride] -#define IDX(III) idx[(III)*stride] - -#define LONG_SWAP(AAA, BBB) swap = AAA; AAA = BBB; BBB = swap -#define REAL_SWAP(AAA, BBB) rswap = AAA; AAA = BBB; BBB = rswap - -#define ARR_SWAP(III, JJJ) \ - REAL_SWAP(ARR(III), ARR(JJJ)); - -#define BOTH_SWAP(III, JJJ) \ - REAL_SWAP(ARR(III), ARR(JJJ)); \ - LONG_SWAP(IDX(III), IDX(JJJ)) - -static void THTensor_(quicksortascend)(real *arr, long *idx, long elements, long stride) -{ - long beg[MAX_LEVELS], end[MAX_LEVELS], i, j, L, R, P, swap, pid, stack = 0, sz_right, sz_left; - real rswap, piv; - unsigned char done = 0; - - /* beg[0]=0; end[0]=elements; */ - stack = 0; - L = 0; R = elements-1; - done = elements-1 <= M_SMALL; - - while(!done) { - /* Use median of three for pivot choice */ - P=(L+R)>>1; - BOTH_SWAP(P, L+1); - if (ARR(L+1) > ARR(R)) { BOTH_SWAP(L+1, R); } - if (ARR(L) > ARR(R)) { BOTH_SWAP(L, R); } - if (ARR(L+1) > ARR(L)) { BOTH_SWAP(L+1, L); } - - i = L+1; j = R; piv = ARR(L); pid = IDX(L); - - do { - do { i = i+1; } while(ARR(i) < piv); - do { j = j-1; } while(ARR(j) > piv); - if (j < i) - break; - BOTH_SWAP(i, j); - } while(1); - BOTH_SWAP(L, j); - /* Left subfile is (L, j-1) */ - /* Right subfile is (i, R) */ - sz_left = j-L; - sz_right = R-i+1; - 
if (sz_left <= M_SMALL && sz_right <= M_SMALL) { - /* both subfiles are small */ - /* if stack empty */ - if (stack == 0) { - done = 1; - } else { - stack--; - L = beg[stack]; - R = end[stack]; - } - } else if (sz_left <= M_SMALL || sz_right <= M_SMALL) { - /* exactly one of the subfiles is small */ - /* (L,R) = large subfile */ - if (sz_left > sz_right) { - /* Implicit: L = L; */ - R = j-1; - } else { - L = i; - /* Implicit: R = R; */ - } - } else { - /* none of the subfiles is small */ - /* push large subfile */ - /* (L,R) = small subfile */ - if (sz_left > sz_right) { - beg[stack] = L; - end[stack] = j-1; - stack++; - L = i; - /* Implicit: R = R */ - } else { - beg[stack] = i; - end[stack] = R; - stack++; - /* Implicit: L = L; */ - R = j-1; - } - } - } /* while not done */ - /* Now insertion sort on the concatenation of subfiles */ - for(i=elements-2; i>=0; i--) { - if (ARR(i) > ARR(i+1)) { - piv = ARR(i); - pid = IDX(i); - j = i+1; - do { - ARR(j-1) = ARR(j); - IDX(j-1) = IDX(j); - j = j+1; - } while(j < elements && ARR(j) < piv); - ARR(j-1) = piv; - IDX(j-1) = pid; - } - } -} - -static void THTensor_(quicksortdescend)(real *arr, long *idx, long elements, long stride) -{ - long beg[MAX_LEVELS], end[MAX_LEVELS], i, j, L, R, P, swap, pid, stack = 0, sz_right, sz_left; - real rswap, piv; - unsigned char done = 0; - - /* beg[0]=0; end[0]=elements; */ - stack = 0; - L = 0; R = elements-1; - done = elements-1 <= M_SMALL; - - while(!done) { - /* Use median of three for pivot choice */ - P=(L+R)>>1; - BOTH_SWAP(P, L+1); - if (ARR(L+1) < ARR(R)) { BOTH_SWAP(L+1, R); } - if (ARR(L) < ARR(R)) { BOTH_SWAP(L, R); } - if (ARR(L+1) < ARR(L)) { BOTH_SWAP(L+1, L); } - - i = L+1; j = R; piv = ARR(L); pid = IDX(L); - - do { - do { i = i+1; } while(ARR(i) > piv); - do { j = j-1; } while(ARR(j) < piv); - if (j < i) - break; - BOTH_SWAP(i, j); - } while(1); - BOTH_SWAP(L, j); - /* Left subfile is (L, j-1) */ - /* Right subfile is (i, R) */ - sz_left = j-L; - sz_right = R-i+1; - if 
(sz_left <= M_SMALL && sz_right <= M_SMALL) { - /* both subfiles are small */ - /* if stack empty */ - if (stack == 0) { - done = 1; - } else { - stack--; - L = beg[stack]; - R = end[stack]; - } - } else if (sz_left <= M_SMALL || sz_right <= M_SMALL) { - /* exactly one of the subfiles is small */ - /* (L,R) = large subfile */ - if (sz_left > sz_right) { - /* Implicit: L = L; */ - R = j-1; - } else { - L = i; - /* Implicit: R = R; */ - } - } else { - /* none of the subfiles is small */ - /* push large subfile */ - /* (L,R) = small subfile */ - if (sz_left > sz_right) { - beg[stack] = L; - end[stack] = j-1; - stack++; - L = i; - /* Implicit: R = R */ - } else { - beg[stack] = i; - end[stack] = R; - stack++; - /* Implicit: L = L; */ - R = j-1; - } - } - } /* while not done */ - /* Now insertion sort on the concatenation of subfiles */ - for(i=elements-2; i>=0; i--) { - if (ARR(i) < ARR(i+1)) { - piv = ARR(i); - pid = IDX(i); - j = i+1; - do { - ARR(j-1) = ARR(j); - IDX(j-1) = IDX(j); - j = j+1; - } while(j < elements && ARR(j) > piv); - ARR(j-1) = piv; - IDX(j-1) = pid; - } - } -} - -#undef MAX_LEVELS -#undef M_SMALL - -void THTensor_(sort)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int dimension, int descendingOrder) -{ - THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 2, "invalid dimension %d", - dimension + TH_INDEX_BASE); - - THTensor_(resizeAs)(rt_, t); - THTensor_(copy)(rt_, t); - - { - THLongStorage *size = THTensor_(newSizeOf)(t); - THLongTensor_resize(ri_, size, NULL); - THLongStorage_free(size); - } - - if(descendingOrder) - { - TH_TENSOR_DIM_APPLY2(real, rt_, long, ri_, dimension, - long i; - for(i = 0; i < ri__size; i++) - ri__data[i*ri__stride] = i; - THTensor_(quicksortdescend)(rt__data, ri__data, rt__size, rt__stride);) - } - else - { - TH_TENSOR_DIM_APPLY2(real, rt_, long, ri_, dimension, - long i; - for(i = 0; i < ri__size; i++) - ri__data[i*ri__stride] = i; - THTensor_(quicksortascend)(rt__data, ri__data, rt__size, 
rt__stride);) - } -} - -/* Implementation of the Quickselect algorithm, based on Nicolas Devillard's -public domain implementation at http://ndevilla.free.fr/median/median/ -Adapted similarly to the above Quicksort algorithm. -This version does not produce indices along with values. */ -static void THTensor_(quickselectnoidx)(real *arr, long k, long elements, long stride) -{ - long P, L, R, i, j, swap; - real rswap, piv; - L = 0; - R = elements-1; - - do { - if (R <= L) /* One element only */ - return; - - if (R == L+1) { /* Two elements only */ - if (ARR(L) > ARR(R)) { - ARR_SWAP(L, R); - } - return; - } - - /* Use median of three for pivot choice */ - P=(L+R)>>1; - ARR_SWAP(P, L+1); - if (ARR(L+1) > ARR(R)) { ARR_SWAP(L+1, R); } - if (ARR(L) > ARR(R)) { ARR_SWAP(L, R); } - if (ARR(L+1) > ARR(L)) { ARR_SWAP(L+1, L); } - - i = L+1; - j = R; - piv = ARR(L); - do { - do i++; while(ARR(i) < piv); - do j--; while(ARR(j) > piv); - if (j < i) - break; - ARR_SWAP(i, j); - } while(1); - ARR_SWAP(L, j); - - /* Re-set active partition */ - if (j <= k) L=i; - if (j >= k) R=j-1; - } while(1); -} - -/* Implementation of the Quickselect algorithm, based on Nicolas Devillard's -public domain implementation at http://ndevilla.free.fr/median/median/ -Adapted similarly to the above Quicksort algorithm. 
*/ -static void THTensor_(quickselect)(real *arr, long *idx, long k, long elements, long stride) -{ - long P, L, R, i, j, swap, pid; - real rswap, piv; - L = 0; - R = elements-1; - - do { - if (R <= L) /* One element only */ - return; - - if (R == L+1) { /* Two elements only */ - if (ARR(L) > ARR(R)) { - BOTH_SWAP(L, R); - } - return; - } - - /* Use median of three for pivot choice */ - P=(L+R)>>1; - BOTH_SWAP(P, L+1); - if (ARR(L+1) > ARR(R)) { BOTH_SWAP(L+1, R); } - if (ARR(L) > ARR(R)) { BOTH_SWAP(L, R); } - if (ARR(L+1) > ARR(L)) { BOTH_SWAP(L+1, L); } - - i = L+1; - j = R; - piv = ARR(L); - pid = IDX(L); - do { - do i++; while(ARR(i) < piv); - do j--; while(ARR(j) > piv); - if (j < i) - break; - BOTH_SWAP(i, j); - } while(1); - BOTH_SWAP(L, j); - - /* Re-set active partition */ - if (j <= k) L=i; - if (j >= k) R=j-1; - } while(1); -} - -#undef ARR -#undef IDX -#undef LONG_SWAP -#undef REAL_SWAP -#undef BOTH_SWAP - -void THTensor_(mode)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim) -{ - THLongStorage *dim; - THTensor *temp_; - THLongTensor *tempi_; - real *temp__data; - long *tempi__data; - long t_size_dim; - - THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 3, "dimension out of range"); - - dim = THTensor_(newSizeOf)(t); - THLongStorage_set(dim, dimension, 1); - THTensor_(resize)(values_, dim, NULL); - THLongTensor_resize(indices_, dim, NULL); - THLongStorage_free(dim); - - t_size_dim = THTensor_(size)(t, dimension); - - temp_ = THTensor_(new)(); - THTensor_(resize1d)(temp_, t_size_dim); - temp__data = THTensor_(data)(temp_); - - tempi_ = THLongTensor_new(); - THLongTensor_resize1d(tempi_, t_size_dim); - tempi__data = THLongTensor_data(tempi_); - - TH_TENSOR_DIM_APPLY3(real, t, real, values_, long, indices_, dimension, - long i; - real mode = 0; - long modei = 0; - long temp_freq = 0; - long max_freq = 0; - for(i = 0; i < t_size_dim; i++) - temp__data[i] = t_data[i*t_stride]; - for(i = 0; i < t_size_dim; 
i++) - tempi__data[i] = i; - THTensor_(quicksortascend)(temp__data, tempi__data, t_size_dim, 1); - - for(i = 0; i < t_size_dim; i++) - { - temp_freq++; - if ((i == t_size_dim - 1) || (temp__data[i] != temp__data[i+1])) - { - if (temp_freq > max_freq) - { - mode = temp__data[i]; - modei = tempi__data[i]; - max_freq = temp_freq; - } - temp_freq = 0; - } - } - *values__data = mode; - *indices__data = modei;); - - THTensor_(free)(temp_); - THLongTensor_free(tempi_); - if (!keepdim) { - THTensor_(squeeze1d)(values_, values_, dimension); - THLongTensor_squeeze1d(indices_, indices_, dimension); - } -} - -void THTensor_(kthvalue)(THTensor *values_, THLongTensor *indices_, THTensor *t, long k, int dimension, int keepdim) -{ - THLongStorage *dim; - THTensor *temp_; - THLongTensor *tempi_; - real *temp__data; - long *tempi__data; - long t_size_dim; - - THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 3, "dimension out of range"); - THArgCheck(k > 0 && k <= t->size[dimension], 2, "selected index out of range"); - - dim = THTensor_(newSizeOf)(t); - THLongStorage_set(dim, dimension, 1); - THTensor_(resize)(values_, dim, NULL); - THLongTensor_resize(indices_, dim, NULL); - THLongStorage_free(dim); - - t_size_dim = THTensor_(size)(t, dimension); - - temp_ = THTensor_(new)(); - THTensor_(resize1d)(temp_, t_size_dim); - temp__data = THTensor_(data)(temp_); - - tempi_ = THLongTensor_new(); - THLongTensor_resize1d(tempi_, t_size_dim); - tempi__data = THLongTensor_data(tempi_); - - TH_TENSOR_DIM_APPLY3(real, t, real, values_, long, indices_, dimension, - long i; - for(i = 0; i < t_size_dim; i++) - temp__data[i] = t_data[i*t_stride]; - for(i = 0; i < t_size_dim; i++) - tempi__data[i] = i; - THTensor_(quickselect)(temp__data, tempi__data, k - 1, t_size_dim, 1); - *values__data = temp__data[k-1]; - *indices__data = tempi__data[k-1];); - - THTensor_(free)(temp_); - THLongTensor_free(tempi_); - if (!keepdim) { - THTensor_(squeeze1d)(values_, values_, dimension); - 
THLongTensor_squeeze1d(indices_, indices_, dimension); - } -} - -void THTensor_(median)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim) -{ - long t_size_dim, k; - - THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 3, "dimension out of range"); - - t_size_dim = THTensor_(size)(t, dimension); - k = (t_size_dim-1) >> 1; /* take middle or one-before-middle element */ - - THTensor_(kthvalue)(values_, indices_, t, k+1, dimension, keepdim); -} - -void THTensor_(topk)(THTensor *rt_, THLongTensor *ri_, THTensor *t, long k, int dim, int dir, int sorted) -{ - int numDims = THTensor_(nDimension)(t); - THArgCheck(dim >= 0 && dim < numDims, 3, "dim not in range"); - - long sliceSize = THTensor_(size)(t, dim); - THArgCheck(k > 0 && k <= sliceSize, 2, "k not in range for dimension"); - - THTensor *tmpResults = THTensor_(new)(); - THTensor_(resize1d)(tmpResults, sliceSize); - real *tmp__data = THTensor_(data)(tmpResults); - - THLongTensor *tmpIndices = THLongTensor_new(); - THLongTensor_resize1d(tmpIndices, sliceSize); - long *tmpi__data = THLongTensor_data(tmpIndices); - - THLongStorage *topKSize = THTensor_(newSizeOf)(t); - THLongStorage_set(topKSize, dim, k); - THTensor_(resize)(rt_, topKSize, NULL); - THLongTensor_resize(ri_, topKSize, NULL); - THLongStorage_free(topKSize); - - if (dir) { - /* k largest elements, descending order (optional: see sorted) */ - long K = sliceSize - k; - TH_TENSOR_DIM_APPLY3(real, t, real, rt_, long, ri_, dim, - long i; - for(i = 0; i < sliceSize; i++) - { - tmp__data[i] = t_data[i*t_stride]; - tmpi__data[i] = i; - } - if (K > 0) - THTensor_(quickselect)(tmp__data, tmpi__data, K - 1, sliceSize, 1); - if (sorted) - THTensor_(quicksortdescend)(tmp__data + K, tmpi__data + K, k, 1); - for(i = 0; i < k; i++) - { - rt__data[i*rt__stride] = tmp__data[i + K]; - ri__data[i*ri__stride] = tmpi__data[i + K]; - }) - } - else { - /* k smallest elements, ascending order (optional: see sorted) */ - 
TH_TENSOR_DIM_APPLY3(real, t, real, rt_, long, ri_, dim, - long i; - for(i = 0; i < sliceSize; i++) - { - tmp__data[i] = t_data[i*t_stride]; - tmpi__data[i] = i; - } - THTensor_(quickselect)(tmp__data, tmpi__data, k - 1, sliceSize, 1); - if (sorted) - THTensor_(quicksortascend)(tmp__data, tmpi__data, k - 1, 1); - for(i = 0; i < k; i++) - { - rt__data[i*rt__stride] = tmp__data[i]; - ri__data[i*ri__stride] = tmpi__data[i]; - }) - } - - THTensor_(free)(tmpResults); - THLongTensor_free(tmpIndices); -} - -void THTensor_(tril)(THTensor *r_, THTensor *t, long k) -{ - long t_size_0, t_size_1; - long t_stride_0, t_stride_1; - long r__stride_0, r__stride_1; - real *t_data, *r__data; - long r, c; - - THArgCheck(THTensor_(nDimension)(t) == 2, 1, "expected a matrix"); - - THTensor_(resizeAs)(r_, t); - - t_size_0 = THTensor_(size)(t, 0); - t_size_1 = THTensor_(size)(t, 1); - t_stride_0 = THTensor_(stride)(t, 0); - t_stride_1 = THTensor_(stride)(t, 1); - r__stride_0 = THTensor_(stride)(r_, 0); - r__stride_1 = THTensor_(stride)(r_, 1); - r__data = THTensor_(data)(r_); - t_data = THTensor_(data)(t); - - for(r = 0; r < t_size_0; r++) - { - long sz = THMin(r+k+1, t_size_1); - for(c = THMax(0, r+k+1); c < t_size_1; c++) - r__data[r*r__stride_0+c*r__stride_1] = 0; - for(c = 0; c < sz; c++) - r__data[r*r__stride_0+c*r__stride_1] = t_data[r*t_stride_0+c*t_stride_1]; - } -} - -void THTensor_(triu)(THTensor *r_, THTensor *t, long k) -{ - long t_size_0, t_size_1; - long t_stride_0, t_stride_1; - long r__stride_0, r__stride_1; - real *t_data, *r__data; - long r, c; - - THArgCheck(THTensor_(nDimension)(t) == 2, 1, "expected a matrix"); - - THTensor_(resizeAs)(r_, t); - - t_size_0 = THTensor_(size)(t, 0); - t_size_1 = THTensor_(size)(t, 1); - t_stride_0 = THTensor_(stride)(t, 0); - t_stride_1 = THTensor_(stride)(t, 1); - r__stride_0 = THTensor_(stride)(r_, 0); - r__stride_1 = THTensor_(stride)(r_, 1); - r__data = THTensor_(data)(r_); - t_data = THTensor_(data)(t); - - for(r = 0; r < t_size_0; 
r++) - { - long sz = THMin(r+k, t_size_1); - for(c = THMax(0, r+k); c < t_size_1; c++) - r__data[r*r__stride_0+c*r__stride_1] = t_data[r*t_stride_0+c*t_stride_1]; - for(c = 0; c < sz; c++) - r__data[r*r__stride_0+c*r__stride_1] = 0; - } -} - -void THTensor_(cat)(THTensor *r_, THTensor *ta, THTensor *tb, int dimension) -{ - THTensor* inputs[2]; - inputs[0] = ta; - inputs[1] = tb; - THTensor_(catArray)(r_, inputs, 2, dimension); -} - -void THTensor_(catArray)(THTensor *result, THTensor **inputs, int numInputs, int dimension) -{ - THLongStorage *size; - int i, j; - long offset; - int maxDim = dimension + 1; - int allEmpty = 1; - int allContiguous = 1; - - // cat_dimension is the actual dimension we cat along - int cat_dimension = dimension; - - for (i = 0; i < numInputs; i++) - { - maxDim = THMax(maxDim, inputs[i]->nDimension); - } - - // When the user input dimension is -1 (i.e. -2 in C) - // Then we pick the maximum last dimension across all tensors. - if ( dimension + TH_INDEX_BASE == -1 ) - { - cat_dimension = maxDim?(maxDim-1):0; - } - - THArgCheck(numInputs > 0, 3, "invalid number of inputs %d", numInputs); - THArgCheck(cat_dimension >= 0, 4, "invalid dimension %d", dimension + TH_INDEX_BASE); - - size = THLongStorage_newWithSize(maxDim); - - for(i = 0; i < maxDim; i++) - { - // dimSize is either the size of the dim if it exists, either 1 if #dim > 0, otherwise 0 - long dimSize = i < inputs[0]->nDimension ? inputs[0]->size[i] : THMin(inputs[0]->nDimension, 1); - if (i == cat_dimension) - { - for (j = 1; j < numInputs; j++) - { - // accumulate the size over the dimension we want to cat on. - // Empty tensors are allowed - dimSize += i < inputs[j]->nDimension ? inputs[j]->size[i] : THMin(inputs[j]->nDimension, 1); - } - } - else - { - for (j = 1; j < numInputs; j++) - { - long sz = (i < inputs[j]->nDimension ? 
inputs[j]->size[i] : THMin(inputs[j]->nDimension, 1)); - // If it's a dimension we're not catting on - // Then fail if sizes are different AND > 0 - if (dimSize != sz && dimSize && sz) - { - THLongStorage_free(size); - THError("inconsistent tensor sizes"); - } - else if(!dimSize) - { - dimSize = sz; - } - } - } - allEmpty = allEmpty && !dimSize; - size->data[i] = dimSize; - } - - // Initiate catting and resizing - // If at least one of the input is not empty - if (!allEmpty) - { - THTensor_(resize)(result, size, NULL); - - // Check contiguity of all inputs and result - for (i = 0; i < numInputs; i++) { - if(inputs[i]->nDimension) { - allContiguous = allContiguous && THTensor_(isContiguous)(inputs[i]); - } - } - allContiguous = allContiguous && THTensor_(isContiguous)(result); - - // First path is for contiguous inputs along dim 1 - // Second path for non-contiguous - if (cat_dimension == 0 && allContiguous) - { - real* result_data = result->storage->data + result->storageOffset; - offset = 0; - for (j = 0; j < numInputs; j++) - { - if (inputs[j]->nDimension) - { - THTensor* input0 = inputs[j]; - real* input0_data = input0->storage->data + input0->storageOffset; - long input0_size = THTensor_(nElement)(input0); - memcpy(result_data + offset, input0_data, input0_size*sizeof(real)); - offset += input0_size; - } - } - } - else - { - offset = 0; - for (j = 0; j < numInputs; j++) - { - if (inputs[j]->nDimension) - { - long dimSize = cat_dimension < inputs[j]->nDimension ? 
inputs[j]->size[cat_dimension] : 1; - THTensor *nt = THTensor_(newWithTensor)(result); - THTensor_(narrow)(nt, NULL, cat_dimension, offset, dimSize); - THTensor_(copy)(nt, inputs[j]); - THTensor_(free)(nt); - offset += dimSize; - } - } - } - } - THLongStorage_free(size); -} - -int THTensor_(equal)(THTensor *ta, THTensor* tb) -{ - int equal = 1; - if(!THTensor_(isSameSizeAs)(ta, tb)) - return 0; - - if (THTensor_(isContiguous)(ta) && THTensor_(isContiguous)(tb)) { - real *tap = THTensor_(data)(ta); - real *tbp = THTensor_(data)(tb); - ptrdiff_t sz = THTensor_(nElement)(ta); - ptrdiff_t i; - for (i=0; i<sz; ++i){ - if(tap[i] != tbp[i]) return 0; - } - } else { - // Short-circuit the apply function on inequality - TH_TENSOR_APPLY2(real, ta, real, tb, - if (equal && *ta_data != *tb_data) { - equal = 0; - TH_TENSOR_APPLY_hasFinished = 1; break; - }) - } - return equal; -} - -#define TENSOR_IMPLEMENT_LOGICAL(NAME,OP) \ - void THTensor_(NAME##Value)(THByteTensor *r_, THTensor* t, real value) \ - { \ - THByteTensor_resizeNd(r_, t->nDimension, t->size, NULL); \ - TH_TENSOR_APPLY2(unsigned char, r_, real, t, \ - *r__data = (*t_data OP value) ? 1 : 0;); \ - } \ - void THTensor_(NAME##ValueT)(THTensor* r_, THTensor* t, real value) \ - { \ - THTensor_(resizeNd)(r_, t->nDimension, t->size, NULL); \ - TH_TENSOR_APPLY2(real, r_, real, t, \ - *r__data = (*t_data OP value) ? 1 : 0;); \ - } \ - void THTensor_(NAME##Tensor)(THByteTensor *r_, THTensor *ta, THTensor *tb) \ - { \ - THByteTensor_resizeNd(r_, ta->nDimension, ta->size, NULL); \ - TH_TENSOR_APPLY3(unsigned char, r_, real, ta, real, tb, \ - *r__data = (*ta_data OP *tb_data) ? 1 : 0;); \ - } \ - void THTensor_(NAME##TensorT)(THTensor *r_, THTensor *ta, THTensor *tb) \ - { \ - THTensor_(resizeNd)(r_, ta->nDimension, ta->size, NULL); \ - TH_TENSOR_APPLY3(real, r_, real, ta, real, tb, \ - *r__data = (*ta_data OP *tb_data) ? 
1 : 0;); \ - } \ - - -TENSOR_IMPLEMENT_LOGICAL(lt,<) -TENSOR_IMPLEMENT_LOGICAL(gt,>) -TENSOR_IMPLEMENT_LOGICAL(le,<=) -TENSOR_IMPLEMENT_LOGICAL(ge,>=) -TENSOR_IMPLEMENT_LOGICAL(eq,==) -TENSOR_IMPLEMENT_LOGICAL(ne,!=) - -#define LAB_IMPLEMENT_BASIC_FUNCTION(NAME, CFUNC) \ - void THTensor_(NAME)(THTensor *r_, THTensor *t) \ - { \ - THTensor_(resizeAs)(r_, t); \ - TH_TENSOR_APPLY2(real, t, real, r_, *r__data = CFUNC(*t_data);); \ - } \ - -#define LAB_IMPLEMENT_BASIC_FUNCTION_VALUE(NAME, CFUNC) \ - void THTensor_(NAME)(THTensor *r_, THTensor *t, real value) \ - { \ - THTensor_(resizeAs)(r_, t); \ - TH_TENSOR_APPLY2(real, t, real, r_, *r__data = CFUNC(*t_data, value);); \ - } \ - -#if defined(TH_REAL_IS_LONG) -LAB_IMPLEMENT_BASIC_FUNCTION(abs,labs) -#endif /* long only part */ - -#if defined(TH_REAL_IS_SHORT) || defined(TH_REAL_IS_INT) -LAB_IMPLEMENT_BASIC_FUNCTION(abs,abs) -#endif /* int only part */ - -#if defined(TH_REAL_IS_BYTE) - -#define TENSOR_IMPLEMENT_LOGICAL_SUM(NAME, OP, INIT_VALUE) \ - int THTensor_(NAME)(THTensor *tensor) \ - { \ - THArgCheck(tensor->nDimension > 0, 1, "empty Tensor"); \ - int sum = INIT_VALUE; \ - TH_TENSOR_APPLY(real, tensor, sum = sum OP *tensor_data;); \ - return sum; \ - } - -TENSOR_IMPLEMENT_LOGICAL_SUM(logicalall, &&, 1) -TENSOR_IMPLEMENT_LOGICAL_SUM(logicalany, ||, 0) - -#endif /* Byte only part */ - -/* floating point only now */ -#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) - -#if defined (TH_REAL_IS_FLOAT) -#define TH_MATH_NAME(fn) fn##f -#else -#define TH_MATH_NAME(fn) fn -#endif - -LAB_IMPLEMENT_BASIC_FUNCTION(log,TH_MATH_NAME(log)) -LAB_IMPLEMENT_BASIC_FUNCTION(lgamma,TH_MATH_NAME(lgamma)) -LAB_IMPLEMENT_BASIC_FUNCTION(log1p,TH_MATH_NAME(log1p)) -LAB_IMPLEMENT_BASIC_FUNCTION(sigmoid,TH_MATH_NAME(TH_sigmoid)) -LAB_IMPLEMENT_BASIC_FUNCTION(exp,TH_MATH_NAME(exp)) -LAB_IMPLEMENT_BASIC_FUNCTION(cos,TH_MATH_NAME(cos)) -LAB_IMPLEMENT_BASIC_FUNCTION(acos,TH_MATH_NAME(acos)) 
-LAB_IMPLEMENT_BASIC_FUNCTION(cosh,TH_MATH_NAME(cosh)) -LAB_IMPLEMENT_BASIC_FUNCTION(sin,TH_MATH_NAME(sin)) -LAB_IMPLEMENT_BASIC_FUNCTION(asin,TH_MATH_NAME(asin)) -LAB_IMPLEMENT_BASIC_FUNCTION(sinh,TH_MATH_NAME(sinh)) -LAB_IMPLEMENT_BASIC_FUNCTION(tan,TH_MATH_NAME(tan)) -LAB_IMPLEMENT_BASIC_FUNCTION(atan,TH_MATH_NAME(atan)) -LAB_IMPLEMENT_BASIC_FUNCTION(tanh,TH_MATH_NAME(tanh)) -LAB_IMPLEMENT_BASIC_FUNCTION_VALUE(pow,TH_MATH_NAME(pow)) -LAB_IMPLEMENT_BASIC_FUNCTION(sqrt,TH_MATH_NAME(sqrt)) -LAB_IMPLEMENT_BASIC_FUNCTION(rsqrt,TH_MATH_NAME(TH_rsqrt)) -LAB_IMPLEMENT_BASIC_FUNCTION(ceil,TH_MATH_NAME(ceil)) -LAB_IMPLEMENT_BASIC_FUNCTION(floor,TH_MATH_NAME(floor)) -LAB_IMPLEMENT_BASIC_FUNCTION(round,TH_MATH_NAME(round)) -LAB_IMPLEMENT_BASIC_FUNCTION(abs,TH_MATH_NAME(fabs)) -LAB_IMPLEMENT_BASIC_FUNCTION(trunc,TH_MATH_NAME(trunc)) -LAB_IMPLEMENT_BASIC_FUNCTION(frac,TH_MATH_NAME(TH_frac)) -LAB_IMPLEMENT_BASIC_FUNCTION(neg,-) -LAB_IMPLEMENT_BASIC_FUNCTION(cinv, TH_MATH_NAME(1.0) / ) - - -void THTensor_(atan2)(THTensor *r_, THTensor *tx, THTensor *ty) -{ - THTensor_(resizeAs)(r_, tx); - TH_TENSOR_APPLY3(real, r_, real, tx, real, ty, *r__data = TH_MATH_NAME(atan2)(*tx_data,*ty_data);); -} - -void THTensor_(lerp)(THTensor *r_, THTensor *a, THTensor *b, real weight) -{ - THArgCheck(THTensor_(nElement)(a) == THTensor_(nElement)(b), 2, "sizes do not match"); - THTensor_(resizeAs)(r_, a); - TH_TENSOR_APPLY3(real, r_, real, a, real, b, *r__data = TH_MATH_NAME(TH_lerp)(*a_data, *b_data, weight);); -} - -void THTensor_(mean)(THTensor *r_, THTensor *t, int dimension, int keepdim) -{ - THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 2, "invalid dimension %d", - dimension + TH_INDEX_BASE); - - THTensor_(sum)(r_, t, dimension, keepdim); - THTensor_(div)(r_, r_, t->size[dimension]); -} - -void THTensor_(std)(THTensor *r_, THTensor *t, int dimension, int flag, int keepdim) -{ - THLongStorage *dim; - - THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 3, 
"invalid dimension %d", - dimension + TH_INDEX_BASE); - - dim = THTensor_(newSizeOf)(t); - THLongStorage_set(dim, dimension, 1); - THTensor_(resize)(r_, dim, NULL); - THLongStorage_free(dim); - - TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, - accreal sum = 0; - accreal sum2 = 0; - long i; - for(i = 0; i < t_size; i++) - { - real z = t_data[i*t_stride]; - sum += z; - sum2 += z*z; - } - - if(flag) - { - sum /= t_size; - sum2 /= t_size; - sum2 -= sum*sum; - sum2 = (sum2 < 0 ? 0 : sum2); - *r__data = (real)TH_MATH_NAME(sqrt)(sum2); - } - else - { - sum /= t_size; - sum2 /= t_size-1; - sum2 -= ((real)t_size)/((real)(t_size-1))*sum*sum; - sum2 = (sum2 < 0 ? 0 : sum2); - *r__data = (real)TH_MATH_NAME(sqrt)(sum2); - }); - - if (!keepdim) { - THTensor_(squeeze1d)(r_, r_, dimension); - } -} - -void THTensor_(var)(THTensor *r_, THTensor *t, int dimension, int flag, int keepdim) -{ - THLongStorage *dim; - - THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 3, "invalid dimension %d", - dimension + TH_INDEX_BASE); - - dim = THTensor_(newSizeOf)(t); - THLongStorage_set(dim, dimension, 1); - THTensor_(resize)(r_, dim, NULL); - THLongStorage_free(dim); - - TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, - accreal sum = 0; - accreal sum2 = 0; - long i; - for(i = 0; i < t_size; i++) - { - real z = t_data[i*t_stride]; - sum += z; - sum2 += z*z; - } - - if(flag) - { - sum /= t_size; - sum2 /= t_size; - sum2 -= sum*sum; - sum2 = (sum2 < 0 ? 0 : sum2); - *r__data = sum2; - } - else - { - sum /= t_size; - sum2 /= t_size-1; - sum2 -= ((real)t_size)/((real)(t_size-1))*sum*sum; - sum2 = (sum2 < 0 ? 
0 : sum2); - *r__data = (real)sum2; - }); - - if (!keepdim) { - THTensor_(squeeze1d)(r_, r_, dimension); - } -} - -void THTensor_(norm)(THTensor *r_, THTensor *t, real value, int dimension, int keepdim) -{ - THLongStorage *dim; - - THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(t), 3, "invalid dimension %d", - dimension + TH_INDEX_BASE); - - dim = THTensor_(newSizeOf)(t); - THLongStorage_set(dim, dimension, 1); - THTensor_(resize)(r_, dim, NULL); - THLongStorage_free(dim); - - if(value == 0) { - TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, - accreal sum = 0; - long i; - for(i = 0; i < t_size; i++) - sum += t_data[i*t_stride] != 0.0; - *r__data = sum;) - } else { - TH_TENSOR_DIM_APPLY2(real, t, real, r_, dimension, - accreal sum = 0; - long i; - for(i = 0; i < t_size; i++) { - sum += TH_MATH_NAME(pow)( - TH_MATH_NAME(fabs)(t_data[i*t_stride]), value); - } - *r__data = TH_MATH_NAME(pow)(sum, 1.0/value);) - } - - if (!keepdim) { - THTensor_(squeeze1d)(r_, r_, dimension); - } -} - -accreal THTensor_(normall)(THTensor *tensor, real value) -{ - accreal sum = 0; - if(value == 0) { - TH_TENSOR_APPLY(real, tensor, sum += *tensor_data != 0.0;); - return sum; - } else if(value == 1) { - TH_TENSOR_APPLY(real, tensor, sum += TH_MATH_NAME(fabs)(*tensor_data);); - return sum; - } else if(value == 2) { - TH_TENSOR_APPLY(real, tensor, accreal z = *tensor_data; sum += z*z;); - return sqrt(sum); - } else { - TH_TENSOR_APPLY(real, tensor, sum += TH_MATH_NAME(pow)(TH_MATH_NAME(fabs)(*tensor_data), value);); - return TH_MATH_NAME(pow)(sum, 1.0/value); - } -} - -void THTensor_(renorm)(THTensor *res, THTensor *src, real value, int dimension, real maxnorm) -{ - int i; - THTensor *rowR, *rowS; - - THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(src), 3, "invalid dimension %d", - dimension + TH_INDEX_BASE); - THArgCheck(value > 0, 2, "non-positive-norm not supported"); - THArgCheck(THTensor_(nDimension)(src) > 1, 1, "need at least 2 dimensions, got %d 
dimensions", - THTensor_(nDimension)(src)); - - rowR = THTensor_(new)(); - rowS = THTensor_(new)(); - - THTensor_(resizeAs)(res, src); - - for (i=0; i<src->size[dimension]; i++) - { - real norm = 0; - real new_norm; - - THTensor_(select)(rowS, src, dimension, i); - THTensor_(select)(rowR, res, dimension, i); - if (value == 1) { - TH_TENSOR_APPLY(real, rowS, norm += fabs(*rowS_data);); - } else if (value == 2) { - TH_TENSOR_APPLY(real, rowS, accreal z = *rowS_data; norm += z*z;); - } else { - TH_TENSOR_APPLY(real, rowS, norm += TH_MATH_NAME(pow)(TH_MATH_NAME(fabs)(*rowS_data), value);); - } - - norm = pow(norm, 1/value); - - if (norm > maxnorm) - { - new_norm = maxnorm / (norm + 1e-7); - - TH_TENSOR_APPLY2( - real, rowR, real, rowS, - *rowR_data = (*rowS_data) * new_norm; - ) - } - else - THTensor_(copy)(rowR, rowS); - } - - THTensor_(free)(rowR); - THTensor_(free)(rowS); -} - -accreal THTensor_(dist)(THTensor *tensor, THTensor *src, real value) -{ - real sum = 0; - TH_TENSOR_APPLY2(real, tensor, real, src, - sum += TH_MATH_NAME(pow)( - TH_MATH_NAME(fabs)(*tensor_data - *src_data), value);); - return TH_MATH_NAME(pow)(sum, 1.0/value); -} - -accreal THTensor_(meanall)(THTensor *tensor) -{ - THArgCheck(tensor->nDimension > 0, 1, "empty Tensor"); - return THTensor_(sumall)(tensor)/THTensor_(nElement)(tensor); -} - -accreal THTensor_(varall)(THTensor *tensor) -{ - accreal mean = THTensor_(meanall)(tensor); - accreal sum = 0; - TH_TENSOR_APPLY(real, tensor, sum += (*tensor_data - mean)*(*tensor_data - mean);); - sum /= (THTensor_(nElement)(tensor)-1); - return sum; -} - -accreal THTensor_(stdall)(THTensor *tensor) -{ - return sqrt(THTensor_(varall)(tensor)); -} - -void THTensor_(linspace)(THTensor *r_, real a, real b, long n) -{ - real i = 0; - - THArgCheck(n > 1 || (n == 1 && (a == b)), 3, "invalid number of points"); - - if (THTensor_(nElement)(r_) != n) { - THTensor_(resize1d)(r_, n); - } - - if(n == 1) { - TH_TENSOR_APPLY(real, r_, - *r__data = a; - i++; - ); - } 
else { - TH_TENSOR_APPLY(real, r_, - *r__data = a + i*(b-a)/((real)(n-1)); - i++; - ); - } -} - -void THTensor_(logspace)(THTensor *r_, real a, real b, long n) -{ - real i = 0; - - THArgCheck(n > 1 || (n == 1 && (a == b)), 3, "invalid number of points"); - - if (THTensor_(nElement)(r_) != n) { - THTensor_(resize1d)(r_, n); - } - - if(n == 1) { - TH_TENSOR_APPLY(real, r_, - *r__data = TH_MATH_NAME(pow)(10.0, a); - i++; - ); - } else { - TH_TENSOR_APPLY(real, r_, - *r__data = TH_MATH_NAME(pow)(10.0, a + i*(b-a)/((real)(n-1))); - i++; - ); - } -} - -void THTensor_(rand)(THTensor *r_, THGenerator *_generator, THLongStorage *size) -{ - THTensor_(resize)(r_, size, NULL); - THTensor_(uniform)(r_, _generator, 0, 1); -} - -void THTensor_(randn)(THTensor *r_, THGenerator *_generator, THLongStorage *size) -{ - THTensor_(resize)(r_, size, NULL); - THTensor_(normal)(r_, _generator, 0, 1); -} - -void THTensor_(histc)(THTensor *hist, THTensor *tensor, long nbins, real minvalue, real maxvalue) -{ - real minval; - real maxval; - real *h_data; - - THTensor_(resize1d)(hist, nbins); - THTensor_(zero)(hist); - minval = minvalue; - maxval = maxvalue; - if (minval == maxval) - { - minval = THTensor_(minall)(tensor); - maxval = THTensor_(maxall)(tensor); - } - if (minval == maxval) - { - minval = minval - 1; - maxval = maxval + 1; - } - - h_data = THTensor_(data)(hist); - - TH_TENSOR_APPLY(real, tensor, - if (*tensor_data >= minval && *tensor_data <= maxval) { - const int bin = (int)((*tensor_data-minval) / (maxval-minval) * nbins); - h_data[THMin(bin, nbins-1)] += 1; - } - ); -} - -void THTensor_(bhistc)(THTensor *hist, THTensor *tensor, long nbins, real minvalue, real maxvalue) -{ - THArgCheck(THTensor_(nDimension)(tensor) < 3, 2, "invalid dimension %d, the input must be a 2d tensor", THTensor_(nDimension)(tensor)); - - int dimension = 1; - THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(tensor), 2, "invalid dimension %d", - dimension + TH_INDEX_BASE); - - real minval; - 
real maxval; - real *h_data; - - THTensor_(resize2d)(hist, tensor->size[0], nbins); - THTensor_(zero)(hist); - - minval = minvalue; - maxval = maxvalue; - if (minval == maxval) - { - minval = THTensor_(minall)(tensor); - maxval = THTensor_(maxall)(tensor); - } - if (minval == maxval) - { - minval = minval - 1; - maxval = maxval + 1; - } - - TH_TENSOR_DIM_APPLY2(real, tensor, real, hist, dimension, long i; - for(i = 0; i < tensor_size; i++) - { - if(tensor_data[i*tensor_stride] >= minval && tensor_data[i*tensor_stride] <= maxval) { - const int bin = (int)((tensor_data[i*tensor_stride]-minval) / (maxval-minval) * nbins); - hist_data[THMin(bin, nbins-1)] += 1; - } - } - ); -} - -#undef TH_MATH_NAME -#endif /* floating point only part */ -#undef IS_NONZERO -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/generic/THTensorMath.h b/contrib/lua-torch/torch7/lib/TH/generic/THTensorMath.h deleted file mode 100644 index 17e54ccf6..000000000 --- a/contrib/lua-torch/torch7/lib/TH/generic/THTensorMath.h +++ /dev/null @@ -1,198 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "generic/THTensorMath.h" -#else - -TH_API void THTensor_(fill)(THTensor *r_, real value); -TH_API void THTensor_(zero)(THTensor *r_); - -TH_API void THTensor_(maskedFill)(THTensor *tensor, THByteTensor *mask, real value); -TH_API void THTensor_(maskedCopy)(THTensor *tensor, THByteTensor *mask, THTensor* src); -TH_API void THTensor_(maskedSelect)(THTensor *tensor, THTensor* src, THByteTensor *mask); - -TH_API void THTensor_(nonzero)(THLongTensor *subscript, THTensor *tensor); - -TH_API void THTensor_(indexSelect)(THTensor *tensor, THTensor *src, int dim, THLongTensor *index); -TH_API void THTensor_(indexCopy)(THTensor *tensor, int dim, THLongTensor *index, THTensor *src); -TH_API void THTensor_(indexAdd)(THTensor *tensor, int dim, THLongTensor *index, THTensor *src); -TH_API void THTensor_(indexFill)(THTensor *tensor, int dim, THLongTensor *index, real val); - -TH_API void 
THTensor_(gather)(THTensor *tensor, THTensor *src, int dim, THLongTensor *index); -TH_API void THTensor_(scatter)(THTensor *tensor, int dim, THLongTensor *index, THTensor *src); -TH_API void THTensor_(scatterAdd)(THTensor *tensor, int dim, THLongTensor *index, THTensor *src); -TH_API void THTensor_(scatterFill)(THTensor *tensor, int dim, THLongTensor *index, real val); - -TH_API accreal THTensor_(dot)(THTensor *t, THTensor *src); - -TH_API real THTensor_(minall)(THTensor *t); -TH_API real THTensor_(maxall)(THTensor *t); -TH_API real THTensor_(medianall)(THTensor *t); -TH_API accreal THTensor_(sumall)(THTensor *t); -TH_API accreal THTensor_(prodall)(THTensor *t); - -TH_API void THTensor_(neg)(THTensor *self, THTensor *src); -TH_API void THTensor_(cinv)(THTensor *self, THTensor *src); - -TH_API void THTensor_(add)(THTensor *r_, THTensor *t, real value); -TH_API void THTensor_(sub)(THTensor *self, THTensor *src, real value); -TH_API void THTensor_(mul)(THTensor *r_, THTensor *t, real value); -TH_API void THTensor_(div)(THTensor *r_, THTensor *t, real value); -TH_API void THTensor_(lshift)(THTensor *r_, THTensor *t, real value); -TH_API void THTensor_(rshift)(THTensor *r_, THTensor *t, real value); -TH_API void THTensor_(fmod)(THTensor *r_, THTensor *t, real value); -TH_API void THTensor_(remainder)(THTensor *r_, THTensor *t, real value); -TH_API void THTensor_(clamp)(THTensor *r_, THTensor *t, real min_value, real max_value); -TH_API void THTensor_(bitand)(THTensor *r_, THTensor *t, real value); -TH_API void THTensor_(bitor)(THTensor *r_, THTensor *t, real value); -TH_API void THTensor_(bitxor)(THTensor *r_, THTensor *t, real value); - -TH_API void THTensor_(cadd)(THTensor *r_, THTensor *t, real value, THTensor *src); -TH_API void THTensor_(csub)(THTensor *self, THTensor *src1, real value, THTensor *src2); -TH_API void THTensor_(cmul)(THTensor *r_, THTensor *t, THTensor *src); -TH_API void THTensor_(cpow)(THTensor *r_, THTensor *t, THTensor *src); -TH_API void 
THTensor_(cdiv)(THTensor *r_, THTensor *t, THTensor *src); -TH_API void THTensor_(clshift)(THTensor *r_, THTensor *t, THTensor *src); -TH_API void THTensor_(crshift)(THTensor *r_, THTensor *t, THTensor *src); -TH_API void THTensor_(cfmod)(THTensor *r_, THTensor *t, THTensor *src); -TH_API void THTensor_(cremainder)(THTensor *r_, THTensor *t, THTensor *src); -TH_API void THTensor_(cbitand)(THTensor *r_, THTensor *t, THTensor *src); -TH_API void THTensor_(cbitor)(THTensor *r_, THTensor *t, THTensor *src); -TH_API void THTensor_(cbitxor)(THTensor *r_, THTensor *t, THTensor *src); - -TH_API void THTensor_(addcmul)(THTensor *r_, THTensor *t, real value, THTensor *src1, THTensor *src2); -TH_API void THTensor_(addcdiv)(THTensor *r_, THTensor *t, real value, THTensor *src1, THTensor *src2); - -TH_API void THTensor_(addmv)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor *mat, THTensor *vec); -TH_API void THTensor_(addmm)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor *mat1, THTensor *mat2); -TH_API void THTensor_(addr)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor *vec1, THTensor *vec2); - -TH_API void THTensor_(addbmm)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor *batch1, THTensor *batch2); -TH_API void THTensor_(baddbmm)(THTensor *r_, real beta, THTensor *t, real alpha, THTensor *batch1, THTensor *batch2); - -TH_API void THTensor_(match)(THTensor *r_, THTensor *m1, THTensor *m2, real gain); - -TH_API ptrdiff_t THTensor_(numel)(THTensor *t); -TH_API void THTensor_(max)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim); -TH_API void THTensor_(min)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim); -TH_API void THTensor_(kthvalue)(THTensor *values_, THLongTensor *indices_, THTensor *t, long k, int dimension, int keepdim); -TH_API void THTensor_(mode)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim); -TH_API void 
THTensor_(median)(THTensor *values_, THLongTensor *indices_, THTensor *t, int dimension, int keepdim); -TH_API void THTensor_(sum)(THTensor *r_, THTensor *t, int dimension, int keepdim); -TH_API void THTensor_(prod)(THTensor *r_, THTensor *t, int dimension, int keepdim); -TH_API void THTensor_(cumsum)(THTensor *r_, THTensor *t, int dimension); -TH_API void THTensor_(cumprod)(THTensor *r_, THTensor *t, int dimension); -TH_API void THTensor_(sign)(THTensor *r_, THTensor *t); -TH_API accreal THTensor_(trace)(THTensor *t); -TH_API void THTensor_(cross)(THTensor *r_, THTensor *a, THTensor *b, int dimension); - -TH_API void THTensor_(cmax)(THTensor *r, THTensor *t, THTensor *src); -TH_API void THTensor_(cmin)(THTensor *r, THTensor *t, THTensor *src); -TH_API void THTensor_(cmaxValue)(THTensor *r, THTensor *t, real value); -TH_API void THTensor_(cminValue)(THTensor *r, THTensor *t, real value); - -TH_API void THTensor_(zeros)(THTensor *r_, THLongStorage *size); -TH_API void THTensor_(ones)(THTensor *r_, THLongStorage *size); -TH_API void THTensor_(diag)(THTensor *r_, THTensor *t, int k); -TH_API void THTensor_(eye)(THTensor *r_, long n, long m); -TH_API void THTensor_(arange)(THTensor *r_, accreal xmin, accreal xmax, accreal step); -TH_API void THTensor_(range)(THTensor *r_, accreal xmin, accreal xmax, accreal step); -TH_API void THTensor_(randperm)(THTensor *r_, THGenerator *_generator, long n); - -TH_API void THTensor_(reshape)(THTensor *r_, THTensor *t, THLongStorage *size); -TH_API void THTensor_(sort)(THTensor *rt_, THLongTensor *ri_, THTensor *t, int dimension, int descendingOrder); -TH_API void THTensor_(topk)(THTensor *rt_, THLongTensor *ri_, THTensor *t, long k, int dim, int dir, int sorted); -TH_API void THTensor_(tril)(THTensor *r_, THTensor *t, long k); -TH_API void THTensor_(triu)(THTensor *r_, THTensor *t, long k); -TH_API void THTensor_(cat)(THTensor *r_, THTensor *ta, THTensor *tb, int dimension); -TH_API void THTensor_(catArray)(THTensor *result, THTensor 
**inputs, int numInputs, int dimension); - -TH_API int THTensor_(equal)(THTensor *ta, THTensor *tb); - -TH_API void THTensor_(ltValue)(THByteTensor *r_, THTensor* t, real value); -TH_API void THTensor_(leValue)(THByteTensor *r_, THTensor* t, real value); -TH_API void THTensor_(gtValue)(THByteTensor *r_, THTensor* t, real value); -TH_API void THTensor_(geValue)(THByteTensor *r_, THTensor* t, real value); -TH_API void THTensor_(neValue)(THByteTensor *r_, THTensor* t, real value); -TH_API void THTensor_(eqValue)(THByteTensor *r_, THTensor* t, real value); - -TH_API void THTensor_(ltValueT)(THTensor *r_, THTensor* t, real value); -TH_API void THTensor_(leValueT)(THTensor *r_, THTensor* t, real value); -TH_API void THTensor_(gtValueT)(THTensor *r_, THTensor* t, real value); -TH_API void THTensor_(geValueT)(THTensor *r_, THTensor* t, real value); -TH_API void THTensor_(neValueT)(THTensor *r_, THTensor* t, real value); -TH_API void THTensor_(eqValueT)(THTensor *r_, THTensor* t, real value); - -TH_API void THTensor_(ltTensor)(THByteTensor *r_, THTensor *ta, THTensor *tb); -TH_API void THTensor_(leTensor)(THByteTensor *r_, THTensor *ta, THTensor *tb); -TH_API void THTensor_(gtTensor)(THByteTensor *r_, THTensor *ta, THTensor *tb); -TH_API void THTensor_(geTensor)(THByteTensor *r_, THTensor *ta, THTensor *tb); -TH_API void THTensor_(neTensor)(THByteTensor *r_, THTensor *ta, THTensor *tb); -TH_API void THTensor_(eqTensor)(THByteTensor *r_, THTensor *ta, THTensor *tb); - -TH_API void THTensor_(ltTensorT)(THTensor *r_, THTensor *ta, THTensor *tb); -TH_API void THTensor_(leTensorT)(THTensor *r_, THTensor *ta, THTensor *tb); -TH_API void THTensor_(gtTensorT)(THTensor *r_, THTensor *ta, THTensor *tb); -TH_API void THTensor_(geTensorT)(THTensor *r_, THTensor *ta, THTensor *tb); -TH_API void THTensor_(neTensorT)(THTensor *r_, THTensor *ta, THTensor *tb); -TH_API void THTensor_(eqTensorT)(THTensor *r_, THTensor *ta, THTensor *tb); - -#if defined(TH_REAL_IS_SHORT) || 
defined(TH_REAL_IS_INT) || defined(TH_REAL_IS_LONG) -TH_API void THTensor_(abs)(THTensor *r_, THTensor *t); -#endif - -#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) - -TH_API void THTensor_(sigmoid)(THTensor *r_, THTensor *t); -TH_API void THTensor_(log)(THTensor *r_, THTensor *t); -TH_API void THTensor_(lgamma)(THTensor *r_, THTensor *t); -TH_API void THTensor_(log1p)(THTensor *r_, THTensor *t); -TH_API void THTensor_(exp)(THTensor *r_, THTensor *t); -TH_API void THTensor_(cos)(THTensor *r_, THTensor *t); -TH_API void THTensor_(acos)(THTensor *r_, THTensor *t); -TH_API void THTensor_(cosh)(THTensor *r_, THTensor *t); -TH_API void THTensor_(sin)(THTensor *r_, THTensor *t); -TH_API void THTensor_(asin)(THTensor *r_, THTensor *t); -TH_API void THTensor_(sinh)(THTensor *r_, THTensor *t); -TH_API void THTensor_(tan)(THTensor *r_, THTensor *t); -TH_API void THTensor_(atan)(THTensor *r_, THTensor *t); -TH_API void THTensor_(atan2)(THTensor *r_, THTensor *tx, THTensor *ty); -TH_API void THTensor_(tanh)(THTensor *r_, THTensor *t); -TH_API void THTensor_(pow)(THTensor *r_, THTensor *t, real value); -TH_API void THTensor_(tpow)(THTensor *r_, real value, THTensor *t); -TH_API void THTensor_(sqrt)(THTensor *r_, THTensor *t); -TH_API void THTensor_(rsqrt)(THTensor *r_, THTensor *t); -TH_API void THTensor_(ceil)(THTensor *r_, THTensor *t); -TH_API void THTensor_(floor)(THTensor *r_, THTensor *t); -TH_API void THTensor_(round)(THTensor *r_, THTensor *t); -TH_API void THTensor_(abs)(THTensor *r_, THTensor *t); -TH_API void THTensor_(trunc)(THTensor *r_, THTensor *t); -TH_API void THTensor_(frac)(THTensor *r_, THTensor *t); -TH_API void THTensor_(lerp)(THTensor *r_, THTensor *a, THTensor *b, real weight); - -TH_API void THTensor_(mean)(THTensor *r_, THTensor *t, int dimension, int keepdim); -TH_API void THTensor_(std)(THTensor *r_, THTensor *t, int dimension, int flag, int keepdim); -TH_API void THTensor_(var)(THTensor *r_, THTensor *t, int dimension, int flag, int 
keepdim); -TH_API void THTensor_(norm)(THTensor *r_, THTensor *t, real value, int dimension, int keepdim); -TH_API void THTensor_(renorm)(THTensor *r_, THTensor *t, real value, int dimension, real maxnorm); -TH_API accreal THTensor_(dist)(THTensor *a, THTensor *b, real value); -TH_API void THTensor_(histc)(THTensor *hist, THTensor *tensor, long nbins, real minvalue, real maxvalue); -TH_API void THTensor_(bhistc)(THTensor *hist, THTensor *tensor, long nbins, real minvalue, real maxvalue); - -TH_API accreal THTensor_(meanall)(THTensor *self); -TH_API accreal THTensor_(varall)(THTensor *self); -TH_API accreal THTensor_(stdall)(THTensor *self); -TH_API accreal THTensor_(normall)(THTensor *t, real value); - -TH_API void THTensor_(linspace)(THTensor *r_, real a, real b, long n); -TH_API void THTensor_(logspace)(THTensor *r_, real a, real b, long n); -TH_API void THTensor_(rand)(THTensor *r_, THGenerator *_generator, THLongStorage *size); -TH_API void THTensor_(randn)(THTensor *r_, THGenerator *_generator, THLongStorage *size); -#endif - -#if defined(TH_REAL_IS_BYTE) - -TH_API int THTensor_(logicalall)(THTensor *self); -TH_API int THTensor_(logicalany)(THTensor *self); - -#endif /* TH_REAL_IS_BYTE */ - -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/generic/THTensorRandom.c b/contrib/lua-torch/torch7/lib/TH/generic/THTensorRandom.c deleted file mode 100644 index 514d3dd27..000000000 --- a/contrib/lua-torch/torch7/lib/TH/generic/THTensorRandom.c +++ /dev/null @@ -1,250 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "generic/THTensorRandom.c" -#else - -void THTensor_(random)(THTensor *self, THGenerator *_generator) -{ -#if defined(TH_REAL_IS_BYTE) - TH_TENSOR_APPLY(real, self, *self_data = (unsigned char)(THRandom_random(_generator) % (UCHAR_MAX+1));); -#elif defined(TH_REAL_IS_CHAR) - TH_TENSOR_APPLY(real, self, *self_data = (char)(THRandom_random(_generator) % (CHAR_MAX+1));); -#elif defined(TH_REAL_IS_SHORT) - TH_TENSOR_APPLY(real, self, *self_data = 
(short)(THRandom_random(_generator) % (SHRT_MAX+1));); -#elif defined(TH_REAL_IS_INT) - TH_TENSOR_APPLY(real, self, *self_data = (int)(THRandom_random(_generator) % (INT_MAX+1UL));); -#elif defined(TH_REAL_IS_LONG) - TH_TENSOR_APPLY(real, self, *self_data = (long)(THRandom_random(_generator) % (LONG_MAX+1UL));); -#elif defined(TH_REAL_IS_FLOAT) - TH_TENSOR_APPLY(real, self, *self_data = (float)(THRandom_random(_generator) % ((1UL << FLT_MANT_DIG)+1));); -#elif defined(TH_REAL_IS_DOUBLE) - TH_TENSOR_APPLY(real, self, *self_data = (double)(THRandom_random(_generator) % ((1ULL << DBL_MANT_DIG)+1));); -#else -#error "Unknown type" -#endif -} - -void THTensor_(geometric)(THTensor *self, THGenerator *_generator, double p) -{ - TH_TENSOR_APPLY(real, self, *self_data = (real)THRandom_geometric(_generator, p);); -} - -void THTensor_(bernoulli)(THTensor *self, THGenerator *_generator, double p) -{ - TH_TENSOR_APPLY(real, self, *self_data = (real)THRandom_bernoulli(_generator, p);); -} - -void THTensor_(bernoulli_FloatTensor)(THTensor *self, THGenerator *_generator, THFloatTensor *p) -{ - TH_TENSOR_APPLY2(real, self, float, p, *self_data = (real)THRandom_bernoulli(_generator, (double)*p_data);); -} - -void THTensor_(bernoulli_DoubleTensor)(THTensor *self, THGenerator *_generator, THDoubleTensor *p) -{ - TH_TENSOR_APPLY2(real, self, double, p, *self_data = (real)THRandom_bernoulli(_generator, (double)*p_data);); -} - -#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) - -void THTensor_(uniform)(THTensor *self, THGenerator *_generator, double a, double b) -{ - TH_TENSOR_APPLY(real, self, *self_data = (real)THRandom_uniform(_generator, a, b);); -} - -void THTensor_(normal)(THTensor *self, THGenerator *_generator, double mean, double stdv) -{ - TH_TENSOR_APPLY(real, self, *self_data = (real)THRandom_normal(_generator, mean, stdv);); -} - -void THTensor_(exponential)(THTensor *self, THGenerator *_generator, double lambda) -{ - TH_TENSOR_APPLY(real, self, *self_data = 
(real)THRandom_exponential(_generator, lambda);); -} - -void THTensor_(cauchy)(THTensor *self, THGenerator *_generator, double median, double sigma) -{ - TH_TENSOR_APPLY(real, self, *self_data = (real)THRandom_cauchy(_generator, median, sigma);); -} - -void THTensor_(logNormal)(THTensor *self, THGenerator *_generator, double mean, double stdv) -{ - TH_TENSOR_APPLY(real, self, *self_data = (real)THRandom_logNormal(_generator, mean, stdv);); -} - -void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTensor *prob_dist, int n_sample, int with_replacement) -{ - int start_dim = THTensor_(nDimension)(prob_dist); - long n_dist; - long n_categories; - THDoubleTensor* cum_dist; - int i,j,k; - - if (start_dim == 1) - { - THTensor_(resize2d)(prob_dist, 1, THTensor_(size)(prob_dist, 0)); - } - - n_dist = THTensor_(size)(prob_dist, 0); - n_categories = THTensor_(size)(prob_dist, 1); - - THArgCheck(n_sample > 0, 2, "cannot sample n_sample < 0 samples"); - - if (!with_replacement) - { - THArgCheck((!with_replacement) && (n_sample <= n_categories), 2, \ - "cannot sample n_sample > prob_dist:size(1) samples without replacement"); - } - - /* cumulative probability distribution vector */ - cum_dist = THDoubleTensor_newWithSize1d(n_categories); - - /* will contain multinomial samples (category indices to be returned) */ - THLongTensor_resize2d(self, n_dist , n_sample); - - for (i=0; i<n_dist; i++) - { - /* Get normalized cumulative distribution from prob distribution */ - double sum = 0; - for (j=0; j<n_categories; j++) - { - sum += THStorage_(get)( \ - prob_dist->storage, \ - prob_dist->storageOffset+i*prob_dist->stride[0]+j*prob_dist->stride[1] \ - ); - THDoubleStorage_set( - cum_dist->storage, \ - cum_dist->storageOffset+j*cum_dist->stride[0], \ - sum \ - ); - } - THArgCheckWithCleanup((sum > 0), THCleanup(THDoubleTensor_free(cum_dist);), 2, - "invalid multinomial distribution (sum of probabilities <= 0)"); - /* normalize cumulative probability distribution so 
that last val is 1 - i.e. doesn't assume original prob_dist row sums to one */ - if ( (sum > 0) || ( ( sum < 1.00001) && (sum > 0.99999) ) ) - { - for (j=0; j<n_categories; j++) - { - THDoubleTensor_data(cum_dist)[j*cum_dist->stride[0]] /= sum; - } - } - - for (j=0; j<n_sample; j++) - { - /* sample a probability mass from a uniform distribution */ - double uniform_sample = THRandom_uniform(_generator, 0, 1); - /* Do a binary search for the slot in which the prob falls - ie cum_dist[row][slot-1] < uniform_prob < cum_distr[row][slot] */ - int left_pointer = 0; - int right_pointer = n_categories; - int mid_pointer; - double cum_prob; - int sample_idx; - /* Make sure the last cumulative distribution bucket sums to 1 */ - THDoubleTensor_data(cum_dist)[(n_categories-1)*cum_dist->stride[0]] = 1; - - while(right_pointer - left_pointer > 0) - { - mid_pointer = left_pointer + (right_pointer - left_pointer) / 2; - cum_prob = THDoubleStorage_get( \ - cum_dist->storage, \ - cum_dist->storageOffset+mid_pointer*cum_dist->stride[0] \ - ); - if (cum_prob < uniform_sample) - { - left_pointer = mid_pointer + 1; - } - else - { - right_pointer = mid_pointer; - } - } - sample_idx = left_pointer; - - /* store in result tensor (will be incremented for lua compat by wrapper) */ - THLongStorage_set( \ - self->storage, \ - self->storageOffset+i*self->stride[0]+j*self->stride[1], \ - sample_idx \ - ); - - /* Once a sample is drawn, it cannot be drawn again. ie sample without replacement */ - if (!with_replacement) - { - /* update cumulative distribution so that sample cannot be drawn again */ - double diff; - double new_val = 0; - double sum; - - if (sample_idx != 0) - { - new_val = THDoubleStorage_get( \ - cum_dist->storage, \ - cum_dist->storageOffset+(sample_idx-1)*cum_dist->stride[0] \ - ); - } - /* marginal cumulative mass (i.e. 
original probability) of sample */ - diff = THDoubleStorage_get( \ - cum_dist->storage, \ - cum_dist->storageOffset+sample_idx*cum_dist->stride[0] \ - ) - new_val; - /* new sum of marginals is not one anymore... */ - sum = 1.0 - diff; - for (k=0; k<n_categories; k++) - { - new_val = THDoubleStorage_get( \ - cum_dist->storage, \ - cum_dist->storageOffset+k*cum_dist->stride[0] \ - ); - if (k >= sample_idx) - { - /* remove sampled probability mass from later cumulative probabilities */ - new_val -= diff; - } - /* make total marginals sum to one */ - new_val /= sum; - THDoubleStorage_set( \ - cum_dist->storage, \ - cum_dist->storageOffset+k*cum_dist->stride[0], \ - new_val \ - ); - } - } - } - } - - THDoubleTensor_free(cum_dist); - - if (start_dim == 1) - { - THLongTensor_resize1d(self, n_sample); - THTensor_(resize1d)(prob_dist, n_categories); - } -} - -#endif - -#if defined(TH_REAL_IS_BYTE) -void THTensor_(getRNGState)(THGenerator *_generator, THTensor *self) -{ - static const size_t size = sizeof(THGenerator); - THGenerator *rng_state; - THTensor_(resize1d)(self, size); - THArgCheck(THTensor_(nElement)(self) == size, 1, "RNG state is wrong size"); - THArgCheck(THTensor_(isContiguous)(self), 1, "RNG state needs to be contiguous"); - rng_state = (THGenerator *)THTensor_(data)(self); - THGenerator_copy(rng_state, _generator); -} - -void THTensor_(setRNGState)(THGenerator *_generator, THTensor *self) -{ - static const size_t size = sizeof(THGenerator); - THGenerator *rng_state; - THArgCheck(THTensor_(nElement)(self) == size, 1, "RNG state is wrong size"); - THArgCheck(THTensor_(isContiguous)(self), 1, "RNG state needs to be contiguous"); - rng_state = (THGenerator *)THTensor_(data)(self); - THArgCheck(THGenerator_isValid(rng_state), 1, "Invalid RNG state"); - THGenerator_copy(_generator, rng_state); -} -#endif - -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/generic/THTensorRandom.h b/contrib/lua-torch/torch7/lib/TH/generic/THTensorRandom.h deleted file mode 
100644 index d20514242..000000000 --- a/contrib/lua-torch/torch7/lib/TH/generic/THTensorRandom.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "generic/THTensorRandom.h" -#else - -TH_API void THTensor_(random)(THTensor *self, THGenerator *_generator); -TH_API void THTensor_(geometric)(THTensor *self, THGenerator *_generator, double p); -TH_API void THTensor_(bernoulli)(THTensor *self, THGenerator *_generator, double p); -TH_API void THTensor_(bernoulli_FloatTensor)(THTensor *self, THGenerator *_generator, THFloatTensor *p); -TH_API void THTensor_(bernoulli_DoubleTensor)(THTensor *self, THGenerator *_generator, THDoubleTensor *p); - -#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) -TH_API void THTensor_(uniform)(THTensor *self, THGenerator *_generator, double a, double b); -TH_API void THTensor_(normal)(THTensor *self, THGenerator *_generator, double mean, double stdv); -TH_API void THTensor_(exponential)(THTensor *self, THGenerator *_generator, double lambda); -TH_API void THTensor_(cauchy)(THTensor *self, THGenerator *_generator, double median, double sigma); -TH_API void THTensor_(logNormal)(THTensor *self, THGenerator *_generator, double mean, double stdv); -TH_API void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTensor *prob_dist, int n_sample, int with_replacement); -#endif - -#if defined(TH_REAL_IS_BYTE) -TH_API void THTensor_(getRNGState)(THGenerator *_generator, THTensor *self); -TH_API void THTensor_(setRNGState)(THGenerator *_generator, THTensor *self); -#endif - -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/generic/THVector.h b/contrib/lua-torch/torch7/lib/TH/generic/THVector.h deleted file mode 100644 index 7d368541a..000000000 --- a/contrib/lua-torch/torch7/lib/TH/generic/THVector.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "generic/THVector.h" -#else - -TH_API void THVector_(fill)(real *x, const real c, const ptrdiff_t n); -TH_API 
void THVector_(cadd)(real *z, const real *x, const real *y, const real c, const ptrdiff_t n); -TH_API void THVector_(adds)(real *y, const real *x, const real c, const ptrdiff_t n); -TH_API void THVector_(cmul)(real *z, const real *x, const real *y, const ptrdiff_t n); -TH_API void THVector_(muls)(real *y, const real *x, const real c, const ptrdiff_t n); -TH_API void THVector_(cdiv)(real *z, const real *x, const real *y, const ptrdiff_t n); -TH_API void THVector_(divs)(real *y, const real *x, const real c, const ptrdiff_t n); -TH_API void THVector_(copy)(real *y, const real *x, const ptrdiff_t n); - -/* Initialize the dispatch pointers */ -TH_API void THVector_(vectorDispatchInit)(void); - -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/generic/THVectorDefault.c b/contrib/lua-torch/torch7/lib/TH/generic/THVectorDefault.c deleted file mode 100644 index 3388e0d9b..000000000 --- a/contrib/lua-torch/torch7/lib/TH/generic/THVectorDefault.c +++ /dev/null @@ -1,131 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "generic/THVectorDefault.c" -#else - -void THVector_(copy_DEFAULT)(real *x, const real *y, const ptrdiff_t n) { - ptrdiff_t i = 0; - - for(; i <n-4; i+=4) - { - x[i] = y[i]; - x[i+1] = y[i+1]; - x[i+2] = y[i+2]; - x[i+3] = y[i+3]; - } - - for(; i < n; i++) - x[i] = y[i]; -} - -void THVector_(fill_DEFAULT)(real *x, const real c, const ptrdiff_t n) { - ptrdiff_t i = 0; - - for(; i <n-4; i+=4) - { - x[i] = c; - x[i+1] = c; - x[i+2] = c; - x[i+3] = c; - } - - for(; i < n; i++) - x[i] = c; -} - -void THVector_(cadd_DEFAULT)(real *z, const real *x, const real *y, const real c, const ptrdiff_t n) -{ - ptrdiff_t i = 0; - - for(; i<n-4; i+=4) - { - z[i] = x[i] + c * y[i]; - z[i+1] = x[i+1] + c * y[i+1]; - z[i+2] = x[i+2] + c * y[i+2]; - z[i+3] = x[i+3] + c * y[i+3]; - } - - for(; i<n; i++) - z[i] = x[i] + c * y[i]; -} - -void THVector_(adds_DEFAULT)(real *y, const real *x, const real c, const ptrdiff_t n) -{ - ptrdiff_t i = 0; - - for(; i<n-4; i+=4) - { - 
y[i] = x[i] + c; - y[i+1] = x[i+1] + c; - y[i+2] = x[i+2] + c; - y[i+3] = x[i+3] + c; - } - - for(; i<n; i++) - y[i] = x[i] + c; -} - -void THVector_(cmul_DEFAULT)(real *z, const real *x, const real *y, const ptrdiff_t n) -{ - ptrdiff_t i = 0; - - for(; i <n-4; i+=4) - { - z[i] = x[i] * y[i]; - z[i+1] = x[i+1] * y[i+1]; - z[i+2] = x[i+2] * y[i+2]; - z[i+3] = x[i+3] * y[i+3]; - } - - for(; i < n; i++) - z[i] = x[i] * y[i]; -} - -void THVector_(muls_DEFAULT)(real *y, const real *x, const real c, const ptrdiff_t n) -{ - ptrdiff_t i = 0; - - for(; i <n-4; i+=4) - { - y[i] = x[i] * c; - y[i+1] = x[i+1] * c; - y[i+2] = x[i+2] * c; - y[i+3] = x[i+3] * c; - } - - for(; i < n; i++) - y[i] = x[i] * c; -} - -void THVector_(cdiv_DEFAULT)(real *z, const real *x, const real *y, const ptrdiff_t n) -{ - ptrdiff_t i = 0; - - for(; i<n-4; i+=4) - { - z[i] = x[i] / y[i]; - z[i+1] = x[i+1] / y[i+1]; - z[i+2] = x[i+2] / y[i+2]; - z[i+3] = x[i+3] / y[i+3]; - } - - for(; i < n; i++) - z[i] = x[i] / y[i]; -} - -void THVector_(divs_DEFAULT)(real *y, const real *x, const real c, const ptrdiff_t n) -{ - ptrdiff_t i = 0; - - for(; i<n-4; i+=4) - { - y[i] = x[i] / c; - y[i+1] = x[i+1] / c; - y[i+2] = x[i+2] / c; - y[i+3] = x[i+3] / c; - } - - for(; i < n; i++) - y[i] = x[i] / c; -} - -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/generic/THVectorDispatch.c b/contrib/lua-torch/torch7/lib/TH/generic/THVectorDispatch.c deleted file mode 100644 index 5b8885283..000000000 --- a/contrib/lua-torch/torch7/lib/TH/generic/THVectorDispatch.c +++ /dev/null @@ -1,262 +0,0 @@ -#ifndef TH_GENERIC_FILE -#define TH_GENERIC_FILE "generic/THVectorDispatch.c" -#else - -/* For now there are only SIMD implementations for FLOAT and DOUBLE. - * Hopefully in the future this can be made totally generic (e.g, there are SIMD implementations - * for a lot of functions */ -/* Each function with multiple implementations has: - * 1. 
A DISPATCHPTR which will be initialized to point to the best available implementation for the host - * 2. A DISPATCHTABLE which holds pointers to each implementation of a function, and a value indicating - * which SIMD extension a given implementation uses - * 3. A dispatch stub, which is what is actually called by clients, that simply wraps the dispatch pointer. - */ - -static void (*THVector_(fill_DISPATCHPTR))(real *, const real, const ptrdiff_t) = &THVector_(fill_DEFAULT); -static FunctionDescription THVector_(fill_DISPATCHTABLE)[] = { - #if defined(__NEON__) - #if defined(TH_REAL_IS_FLOAT) - FUNCTION_IMPL(THVector_(fill_NEON), SIMDExtension_NEON), - #endif - #endif - - #if defined(__PPC64__) - #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT) - FUNCTION_IMPL(THVector_(fill_VSX), SIMDExtension_VSX), - #endif - #endif - - #if defined(USE_AVX) - #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT) - FUNCTION_IMPL(THVector_(fill_AVX), SIMDExtension_AVX), - #endif - #endif - - #if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \ - || defined(USE_SSE4_1) || defined(USE_SSE4_2) - #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT) - FUNCTION_IMPL(THVector_(fill_SSE), SIMDExtension_SSE), - #endif - #endif - FUNCTION_IMPL(THVector_(fill_DEFAULT), SIMDExtension_DEFAULT) -}; -void THVector_(fill)(real *x, const real c, const ptrdiff_t n) { - THVector_(fill_DISPATCHPTR)(x, c, n); -} - -static void (*THVector_(cadd_DISPATCHPTR))(real *, const real *, const real *, const real, const ptrdiff_t) = &THVector_(cadd_DEFAULT); -static FunctionDescription THVector_(cadd_DISPATCHTABLE)[] = { - #if defined(__NEON__) - #if defined(TH_REAL_IS_FLOAT) - FUNCTION_IMPL(THVector_(cadd_NEON), SIMDExtension_NEON), - #endif - #endif - - #if defined(USE_AVX2) - #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT) - FUNCTION_IMPL(THVector_(cadd_AVX2), SIMDExtension_AVX2), - #endif - #endif - - #if defined(USE_AVX) - #if defined(TH_REAL_IS_DOUBLE) || 
defined(TH_REAL_IS_FLOAT) - FUNCTION_IMPL(THVector_(cadd_AVX), SIMDExtension_AVX), - #endif - #endif - - #if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \ - || defined(USE_SSE4_1) || defined(USE_SSE4_2) - #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT) - FUNCTION_IMPL(THVector_(cadd_SSE), SIMDExtension_SSE), - #endif - #endif - - FUNCTION_IMPL(THVector_(cadd_DEFAULT), SIMDExtension_DEFAULT) -}; -void THVector_(cadd)(real *z, const real *x, const real *y, const real c, const ptrdiff_t n) { - THVector_(cadd_DISPATCHPTR)(z, x, y, c, n); -} - -static void (*THVector_(adds_DISPATCHPTR))(real *, const real *, const real, const ptrdiff_t) = &THVector_(adds_DEFAULT); -static FunctionDescription THVector_(adds_DISPATCHTABLE)[] = { - #if defined(__NEON__) - #if defined(TH_REAL_IS_FLOAT) - FUNCTION_IMPL(THVector_(adds_NEON), SIMDExtension_NEON), - #endif - #endif - - #if defined(__PPC64__) - #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT) - FUNCTION_IMPL(THVector_(adds_VSX), SIMDExtension_VSX), - #endif - #endif - - #if defined(USE_AVX) - #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT) - FUNCTION_IMPL(THVector_(adds_AVX), SIMDExtension_AVX), - #endif - #endif - - #if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \ - || defined(USE_SSE4_1) || defined(USE_SSE4_2) - #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT) - FUNCTION_IMPL(THVector_(adds_SSE), SIMDExtension_SSE), - #endif - #endif - - FUNCTION_IMPL(THVector_(adds_DEFAULT), SIMDExtension_DEFAULT) -}; -// Dispatch stubs that just call the pointers -TH_API void THVector_(adds)(real *r_, const real *t, const real value, const ptrdiff_t n) { - THVector_(adds_DISPATCHPTR)(r_, t, value, n); -} - -static void (*THVector_(cmul_DISPATCHPTR))(real *, const real *, const real *, const ptrdiff_t) = &THVector_(cmul_DEFAULT); -static FunctionDescription THVector_(cmul_DISPATCHTABLE)[] = { - #if defined(__NEON__) - #if defined(TH_REAL_IS_FLOAT) - 
FUNCTION_IMPL(THVector_(cmul_NEON), SIMDExtension_NEON), - #endif - #endif - - #if defined(USE_AVX) - #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT) - FUNCTION_IMPL(THVector_(cmul_AVX), SIMDExtension_AVX), - #endif - #endif - - #if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \ - || defined(USE_SSE4_1) || defined(USE_SSE4_2) - #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT) - FUNCTION_IMPL(THVector_(cmul_SSE), SIMDExtension_SSE), - #endif - #endif - - FUNCTION_IMPL(THVector_(cmul_DEFAULT), SIMDExtension_DEFAULT) -}; -void THVector_(cmul)(real *z, const real *x, const real *y, const ptrdiff_t n) { - THVector_(cmul_DISPATCHPTR)(z, x, y, n); -} - -static void (*THVector_(muls_DISPATCHPTR))(real *, const real *, const real, const ptrdiff_t) = &THVector_(muls_DEFAULT); -static FunctionDescription THVector_(muls_DISPATCHTABLE)[] = { - #if defined(__NEON__) - #if defined(TH_REAL_IS_FLOAT) - FUNCTION_IMPL(THVector_(muls_NEON), SIMDExtension_NEON), - #endif - #endif - - #if defined(__PPC64__) - #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT) - FUNCTION_IMPL(THVector_(muls_VSX), SIMDExtension_VSX), - #endif - #endif - - #if defined(USE_AVX) - #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT) - FUNCTION_IMPL(THVector_(muls_AVX), SIMDExtension_AVX), - #endif - #endif - - #if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \ - || defined(USE_SSE4_1) || defined(USE_SSE4_2) - #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT) - FUNCTION_IMPL(THVector_(muls_SSE), SIMDExtension_SSE), - #endif - #endif - - FUNCTION_IMPL(THVector_(muls_DEFAULT), SIMDExtension_DEFAULT) -}; -void THVector_(muls)(real *y, const real *x, const real c, const ptrdiff_t n) { - THVector_(muls_DISPATCHPTR)(y, x, c, n); -} - -static void (*THVector_(cdiv_DISPATCHPTR))(real *, const real *, const real *, const ptrdiff_t) = &THVector_(cdiv_DEFAULT); -static FunctionDescription THVector_(cdiv_DISPATCHTABLE)[] = { - #if 
defined(__NEON__) - #if defined(TH_REAL_IS_FLOAT) - FUNCTION_IMPL(THVector_(cdiv_NEON), SIMDExtension_NEON), - #endif - #endif - - #if defined(USE_AVX) - #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT) - FUNCTION_IMPL(THVector_(cdiv_AVX), SIMDExtension_AVX), - #endif - #endif - - #if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \ - || defined(USE_SSE4_1) || defined(USE_SSE4_2) - #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT) - FUNCTION_IMPL(THVector_(cdiv_SSE), SIMDExtension_SSE), - #endif - #endif - - FUNCTION_IMPL(THVector_(cdiv_DEFAULT), SIMDExtension_DEFAULT) -}; -void THVector_(cdiv)(real *z, const real *x, const real *y, const ptrdiff_t n) { - THVector_(cdiv_DISPATCHPTR)(z, x, y, n); -} - -static void (*THVector_(divs_DISPATCHPTR))(real *, const real *, const real, const ptrdiff_t) = &THVector_(divs_DEFAULT); -static FunctionDescription THVector_(divs_DISPATCHTABLE)[] = { - #if defined(__NEON__) - #if defined(TH_REAL_IS_FLOAT) - FUNCTION_IMPL(THVector_(divs_NEON), SIMDExtension_NEON), - #endif - #endif - - #if defined(USE_AVX) - #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT) - FUNCTION_IMPL(THVector_(divs_AVX), SIMDExtension_AVX), - #endif - #endif - - #if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \ - || defined(USE_SSE4_1) || defined(USE_SSE4_2) - #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT) - FUNCTION_IMPL(THVector_(divs_SSE), SIMDExtension_SSE), - #endif - #endif - - FUNCTION_IMPL(THVector_(divs_DEFAULT), SIMDExtension_DEFAULT) -}; -void THVector_(divs)(real *y, const real *x, const real c, const ptrdiff_t n) { - THVector_(divs_DISPATCHPTR)(y, x, c, n); -} - -static void (*THVector_(copy_DISPATCHPTR))(real *, const real *, const ptrdiff_t) = &THVector_(copy_DEFAULT); -static FunctionDescription THVector_(copy_DISPATCHTABLE)[] = { - #if defined(USE_AVX) - #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT) - FUNCTION_IMPL(THVector_(copy_AVX), SIMDExtension_AVX), 
- #endif - #endif - - FUNCTION_IMPL(THVector_(copy_DEFAULT), SIMDExtension_DEFAULT) -}; -void THVector_(copy)(real *y, const real *x, const ptrdiff_t n) { - THVector_(copy_DISPATCHPTR)(y, x, n); -} - -/* This needs to be called in order to initialize the dispatch pointers at runtime. - * This function simply checks what SIMD extensions are available, and then walks the dispatch table - * to choose the best function. - * NOTE: As implemented, it will initialize the dispatch pointer to the first supported function. - * This means that in the dispatch tables, implementations supporting more recent extensions - * need to come first - */ -void THVector_(vectorDispatchInit)(void) -{ - uint32_t hostSimdExts = detectHostSIMDExtensions(); - INIT_DISPATCH_PTR(fill); - INIT_DISPATCH_PTR(cadd); - INIT_DISPATCH_PTR(adds); - INIT_DISPATCH_PTR(cmul); - INIT_DISPATCH_PTR(muls); - INIT_DISPATCH_PTR(cdiv); - INIT_DISPATCH_PTR(divs); - INIT_DISPATCH_PTR(copy); -} - -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/generic/simd/common_simd.h b/contrib/lua-torch/torch7/lib/TH/generic/simd/common_simd.h deleted file mode 100644 index 425b4b96e..000000000 --- a/contrib/lua-torch/torch7/lib/TH/generic/simd/common_simd.h +++ /dev/null @@ -1,395 +0,0 @@ -#ifndef COMMON_SIMD_H -#define COMMON_SIMD_H - -/* Weights */ -#define LOAD_WEIGHT(q, simd_type, inst_var) _m ## simd_type ## inst_var(*(q)) - -#define DECLARE_WEIGHTS(simd_type) \ -__ ## simd_type weight0; \ -__ ## simd_type weight1; \ -__ ## simd_type weight2; \ -__ ## simd_type weight3; \ -__ ## simd_type weight4; - -#define LOAD_WEIGHTS(k, simd_type, inst_var) \ -weight0 = LOAD_WEIGHT(weight + 5 * 0 + k, simd_type, inst_var); \ -weight1 = LOAD_WEIGHT(weight + 5 * 1 + k, simd_type, inst_var); \ -weight2 = LOAD_WEIGHT(weight + 5 * 2 + k, simd_type, inst_var); \ -weight3 = LOAD_WEIGHT(weight + 5 * 3 + k, simd_type, inst_var); \ -weight4 = LOAD_WEIGHT(weight + 5 * 4 + k, simd_type, inst_var); - -/* Inputs declare */ -#define 
DECLARE_INPUT_0(i) \ -float* input0 = image + i; \ - -#define DECLARE_INPUT_1() \ -float* input1 = input0 + inputStride; \ -float* input2 = input1 + inputStride; \ -float* input3 = input2 + inputStride; \ -float* input4 = input3 + inputStride; - -#define DECLARE_INPUT_2() \ -DECLARE_INPUT_1() \ -float* input5 = input4 + inputStride; - -#define DECLARE_INPUT_4() \ -DECLARE_INPUT_2() \ -float* input6 = input5 + inputStride; \ -float* input7 = input6 + inputStride; - -#define DECLARE_INPUT_5() \ -DECLARE_INPUT_4() \ -float* input8 = input7 + inputStride; - -#define DECLARE_INPUT_6() \ -DECLARE_INPUT_5() \ -float* input9 = input8 + inputStride; - -#define DECLARE_INPUT_7() \ -DECLARE_INPUT_6() \ -float* inputA = input9 + inputStride; - -#define DECLARE_INPUT_8() \ -DECLARE_INPUT_7() \ -float* inputB = inputA + inputStride; - - -/* Inputs increment */ -#define INC_INPUT_1()\ -input0++; \ -input1++; \ -input2++; \ -input3++; \ -input4++; \ - -#define INC_INPUT_2()\ -INC_INPUT_1() \ -input5++; - -#define INC_INPUT_4()\ -INC_INPUT_2() \ -input6++; \ -input7++; - -#define INC_INPUT_5()\ -INC_INPUT_4() \ -input8++; - -#define INC_INPUT_6()\ -INC_INPUT_5() \ -input9++; - -#define INC_INPUT_7()\ -INC_INPUT_6() \ -inputA++; - -#define INC_INPUT_8()\ -INC_INPUT_7() \ -inputB++; - -/* Outputs declare */ -#define DECLARE_OUTPUT_1() \ -float* output0 = output; - -#define DECLARE_OUTPUT_2() \ -DECLARE_OUTPUT_1() \ -float* output1 = output0 + outputStride; - -#define DECLARE_OUTPUT_4() \ -DECLARE_OUTPUT_2() \ -float* output2 = output1 + outputStride; \ -float* output3 = output2 + outputStride; - -#define DECLARE_OUTPUT_5() \ -DECLARE_OUTPUT_4() \ -float* output4 = output3 + outputStride; - -#define DECLARE_OUTPUT_6() \ -DECLARE_OUTPUT_5() \ -float* output5 = output4 + outputStride; - -#define DECLARE_OUTPUT_7() \ -DECLARE_OUTPUT_6() \ -float* output6 = output5 + outputStride; - -#define DECLARE_OUTPUT_8() \ -DECLARE_OUTPUT_7() \ -float* output7 = output6 + outputStride; - -/* Outputs 
increment */ -#define INC_OUTPUT_1(x) \ -output0 += x; - -#define INC_OUTPUT_2(x) \ -INC_OUTPUT_1(x) \ -output1 += x; - -#define INC_OUTPUT_4(x) \ -INC_OUTPUT_2(x) \ -output2 += x; \ -output3 += x; - -#define INC_OUTPUT_5(x) \ -INC_OUTPUT_4(x) \ -output4 += x; - -#define INC_OUTPUT_6(x) \ -INC_OUTPUT_5(x) \ -output5 += x; - -#define INC_OUTPUT_7(x) \ -INC_OUTPUT_6(x) \ -output6 += x; - -#define INC_OUTPUT_8(x) \ -INC_OUTPUT_7(x) \ -output7 += x; - -/* Image declare */ -#define DECLARE_IMAGE_1(simd_type) \ -__ ## simd_type image0; \ -__ ## simd_type image1; \ -__ ## simd_type image2; \ -__ ## simd_type image3; \ -__ ## simd_type image4; - -#define DECLARE_IMAGE_2(simd_type) \ -DECLARE_IMAGE_1(simd_type) \ -__ ## simd_type image5; - -#define DECLARE_IMAGE_4(simd_type) \ -DECLARE_IMAGE_2(simd_type) \ -__ ## simd_type image6; \ -__ ## simd_type image7; - -#define DECLARE_IMAGE_5(simd_type) \ -DECLARE_IMAGE_4(simd_type) \ -__ ## simd_type image8; - -#define DECLARE_IMAGE_6(simd_type) \ -DECLARE_IMAGE_5(simd_type) \ -__ ## simd_type image9; - -#define DECLARE_IMAGE_7(simd_type) \ -DECLARE_IMAGE_6(simd_type) \ -__ ## simd_type imageA; - -#define DECLARE_IMAGE_8(simd_type) \ -DECLARE_IMAGE_7(simd_type) \ -__ ## simd_type imageB; - -/* Sums declare */ -#define DECLARE_SUM_1(simd_type) \ -__ ## simd_type sum0; - -#define DECLARE_SUM_2(simd_type) \ -DECLARE_SUM_1(simd_type) \ -__ ## simd_type sum1; - -#define DECLARE_SUM_4(simd_type) \ -DECLARE_SUM_2(simd_type) \ -__ ## simd_type sum2; \ -__ ## simd_type sum3; - -#define DECLARE_SUM_5(simd_type) \ -DECLARE_SUM_4(simd_type) \ -__ ## simd_type sum4; - -#define DECLARE_SUM_6(simd_type) \ -DECLARE_SUM_5(simd_type) \ -__ ## simd_type sum5; - -#define DECLARE_SUM_7(simd_type) \ -DECLARE_SUM_6(simd_type) \ -__ ## simd_type sum6; - -#define DECLARE_SUM_8(simd_type) \ -DECLARE_SUM_7(simd_type) \ -__ ## simd_type sum7; - -/* Sums load */ -#define LOAD_SUM_1(simd_type) \ -sum0 = _m ## simd_type ## _loadu_ps(output0); - -#define 
LOAD_SUM_2(simd_type) \ -LOAD_SUM_1(simd_type) \ -sum1 = _m ## simd_type ## _loadu_ps(output1); - -#define LOAD_SUM_4(simd_type) \ -LOAD_SUM_2(simd_type) \ -sum2 = _m ## simd_type ## _loadu_ps(output2); \ -sum3 = _m ## simd_type ## _loadu_ps(output3); - -#define LOAD_SUM_5(simd_type) \ -LOAD_SUM_4(simd_type) \ -sum4 = _m ## simd_type ## _loadu_ps(output4); - -#define LOAD_SUM_6(simd_type) \ -LOAD_SUM_5(simd_type) \ -sum5 = _m ## simd_type ## _loadu_ps(output5); - -#define LOAD_SUM_7(simd_type) \ -LOAD_SUM_6(simd_type) \ -sum6 = _m ## simd_type ## _loadu_ps(output6); - -#define LOAD_SUM_8(simd_type) \ -LOAD_SUM_7(simd_type) \ -sum7 = _m ## simd_type ## _loadu_ps(output7); - -/* Sums store */ -#define STORE_SUM_1(simd_type) \ -_m ## simd_type ## _storeu_ps(output0, sum0); - -#define STORE_SUM_2(simd_type) \ -STORE_SUM_1(simd_type) \ -_m ## simd_type ## _storeu_ps(output1, sum1); - -#define STORE_SUM_4(simd_type) \ -STORE_SUM_2(simd_type) \ -_m ## simd_type ## _storeu_ps(output2, sum2); \ -_m ## simd_type ## _storeu_ps(output3, sum3); - -#define STORE_SUM_5(simd_type) \ -STORE_SUM_4(simd_type) \ -_m ## simd_type ## _storeu_ps(output4, sum4); - -#define STORE_SUM_6(simd_type) \ -STORE_SUM_5(simd_type) \ -_m ## simd_type ## _storeu_ps(output5, sum5); - -#define STORE_SUM_7(simd_type) \ -STORE_SUM_6(simd_type) \ -_m ## simd_type ## _storeu_ps(output6, sum6); - -#define STORE_SUM_8(simd_type) \ -STORE_SUM_7(simd_type) \ -_m ## simd_type ## _storeu_ps(output7, sum7); - -/* Convolution */ -#define CONVOLVE_1ROWS(simd_type) \ -image0 = _m ## simd_type ## _loadu_ps(input0); \ -image1 = _m ## simd_type ## _loadu_ps(input1); \ -image2 = _m ## simd_type ## _loadu_ps(input2); \ -image3 = _m ## simd_type ## _loadu_ps(input3); \ -image4 = _m ## simd_type ## _loadu_ps(input4); \ -\ -sum0 = _m ## simd_type ## _add_ps(sum0, _m ## simd_type ## _mul_ps(weight0, image0)); \ -sum0 = _m ## simd_type ## _add_ps(sum0, _m ## simd_type ## _mul_ps(weight1, image1)); \ -sum0 = _m ## simd_type ## 
_add_ps(sum0, _m ## simd_type ## _mul_ps(weight2, image2)); \ -sum0 = _m ## simd_type ## _add_ps(sum0, _m ## simd_type ## _mul_ps(weight3, image3)); \ -sum0 = _m ## simd_type ## _add_ps(sum0, _m ## simd_type ## _mul_ps(weight4, image4)); - -#define CONVOLVE_2ROWS(simd_type) \ -CONVOLVE_1ROWS(simd_type) \ -image5 = _m ## simd_type ## _loadu_ps(input5); \ -sum1 = _m ## simd_type ## _add_ps(sum1, _m ## simd_type ## _mul_ps(weight0, image1)); \ -sum1 = _m ## simd_type ## _add_ps(sum1, _m ## simd_type ## _mul_ps(weight1, image2)); \ -sum1 = _m ## simd_type ## _add_ps(sum1, _m ## simd_type ## _mul_ps(weight2, image3)); \ -sum1 = _m ## simd_type ## _add_ps(sum1, _m ## simd_type ## _mul_ps(weight3, image4)); \ -sum1 = _m ## simd_type ## _add_ps(sum1, _m ## simd_type ## _mul_ps(weight4, image5)); - -#define CONVOLVE_4ROWS(simd_type) \ -CONVOLVE_2ROWS(simd_type) \ -image6 = _m ## simd_type ## _loadu_ps(input6); \ -sum2 = _m ## simd_type ## _add_ps(sum2, _m ## simd_type ## _mul_ps(weight0, image2)); \ -sum2 = _m ## simd_type ## _add_ps(sum2, _m ## simd_type ## _mul_ps(weight1, image3)); \ -sum2 = _m ## simd_type ## _add_ps(sum2, _m ## simd_type ## _mul_ps(weight2, image4)); \ -sum2 = _m ## simd_type ## _add_ps(sum2, _m ## simd_type ## _mul_ps(weight3, image5)); \ -sum2 = _m ## simd_type ## _add_ps(sum2, _m ## simd_type ## _mul_ps(weight4, image6)); \ -\ -image7 = _m ## simd_type ## _loadu_ps(input7); \ -sum3 = _m ## simd_type ## _add_ps(sum3, _m ## simd_type ## _mul_ps(weight0, image3)); \ -sum3 = _m ## simd_type ## _add_ps(sum3, _m ## simd_type ## _mul_ps(weight1, image4)); \ -sum3 = _m ## simd_type ## _add_ps(sum3, _m ## simd_type ## _mul_ps(weight2, image5)); \ -sum3 = _m ## simd_type ## _add_ps(sum3, _m ## simd_type ## _mul_ps(weight3, image6)); \ -sum3 = _m ## simd_type ## _add_ps(sum3, _m ## simd_type ## _mul_ps(weight4, image7)); - -#define CONVOLVE_5ROWS(simd_type) \ -CONVOLVE_4ROWS(simd_type) \ -image8 = _m ## simd_type ## _loadu_ps(input8); \ -sum4 = _m ## simd_type 
## _add_ps(sum4, _m ## simd_type ## _mul_ps(weight0, image4)); \ -sum4 = _m ## simd_type ## _add_ps(sum4, _m ## simd_type ## _mul_ps(weight1, image5)); \ -sum4 = _m ## simd_type ## _add_ps(sum4, _m ## simd_type ## _mul_ps(weight2, image6)); \ -sum4 = _m ## simd_type ## _add_ps(sum4, _m ## simd_type ## _mul_ps(weight3, image7)); \ -sum4 = _m ## simd_type ## _add_ps(sum4, _m ## simd_type ## _mul_ps(weight4, image8)); - -#define CONVOLVE_6ROWS(simd_type) \ -CONVOLVE_5ROWS(simd_type) \ -image9 = _m ## simd_type ## _loadu_ps(input9); \ -sum5 = _m ## simd_type ## _add_ps(sum5, _m ## simd_type ## _mul_ps(weight0, image5)); \ -sum5 = _m ## simd_type ## _add_ps(sum5, _m ## simd_type ## _mul_ps(weight1, image6)); \ -sum5 = _m ## simd_type ## _add_ps(sum5, _m ## simd_type ## _mul_ps(weight2, image7)); \ -sum5 = _m ## simd_type ## _add_ps(sum5, _m ## simd_type ## _mul_ps(weight3, image8)); \ -sum5 = _m ## simd_type ## _add_ps(sum5, _m ## simd_type ## _mul_ps(weight4, image9)); - -#define CONVOLVE_7ROWS(simd_type) \ -CONVOLVE_6ROWS(simd_type) \ -imageA = _m ## simd_type ## _loadu_ps(inputA); \ -sum6 = _m ## simd_type ## _add_ps(sum6, _m ## simd_type ## _mul_ps(weight0, image6)); \ -sum6 = _m ## simd_type ## _add_ps(sum6, _m ## simd_type ## _mul_ps(weight1, image7)); \ -sum6 = _m ## simd_type ## _add_ps(sum6, _m ## simd_type ## _mul_ps(weight2, image8)); \ -sum6 = _m ## simd_type ## _add_ps(sum6, _m ## simd_type ## _mul_ps(weight3, image9)); \ -sum6 = _m ## simd_type ## _add_ps(sum6, _m ## simd_type ## _mul_ps(weight4, imageA)); - -#define CONVOLVE_8ROWS(simd_type) \ -CONVOLVE_7ROWS(simd_type) \ -imageB = _m ## simd_type ## _loadu_ps(inputB); \ -sum7 = _m ## simd_type ## _add_ps(sum7, _m ## simd_type ## _mul_ps(weight0, image7)); \ -sum7 = _m ## simd_type ## _add_ps(sum7, _m ## simd_type ## _mul_ps(weight1, image8)); \ -sum7 = _m ## simd_type ## _add_ps(sum7, _m ## simd_type ## _mul_ps(weight2, image9)); \ -sum7 = _m ## simd_type ## _add_ps(sum7, _m ## simd_type ## 
_mul_ps(weight3, imageA)); \ -sum7 = _m ## simd_type ## _add_ps(sum7, _m ## simd_type ## _mul_ps(weight4, imageB)); - -/* Convolution MEGA macro */ -#define DECLARE_SUMX(rows) DECLARE_SUM_ ## rows -#define LOAD_SUMX(rows) LOAD_SUM_ ## rows -#define DECLARE_INPUTX(rows) DECLARE_INPUT_ ## rows -#define DECLARE_IMAGEX(rows) DECLARE_IMAGE_ ## rows -#define CONVOLVEX(rows) CONVOLVE_ ## rows ## ROWS -#define INC_INPUTX(rows) INC_INPUT_ ## rows -#define STORE_SUMX(rows) STORE_SUM_ ## rows -#define INC_OUTPUTX(rows) INC_OUTPUT_ ## rows - -#define CONVOLUTION_LOOP(rows, simd_type, simd_inst_prefex, simd_set, i) \ -DECLARE_SUMX(rows)(simd_type) \ -LOAD_SUMX(rows)(simd_inst_prefex) \ -DECLARE_WEIGHTS(simd_type) \ -DECLARE_INPUT_0(i) \ -DECLARE_INPUTX(rows)() \ -DECLARE_IMAGEX(rows)(simd_type) \ -\ -LOAD_WEIGHTS(0, simd_inst_prefex, simd_set) \ -CONVOLVEX(rows)(simd_inst_prefex) \ -INC_INPUTX(rows)() \ -\ -LOAD_WEIGHTS(1, simd_inst_prefex, simd_set) \ -CONVOLVEX(rows)(simd_inst_prefex) \ -INC_INPUTX(rows)() \ -\ -LOAD_WEIGHTS(2, simd_inst_prefex, simd_set) \ -CONVOLVEX(rows)(simd_inst_prefex) \ -INC_INPUTX(rows)() \ -\ -LOAD_WEIGHTS(3, simd_inst_prefex, simd_set) \ -CONVOLVEX(rows)(simd_inst_prefex) \ -INC_INPUTX(rows)() \ -\ -LOAD_WEIGHTS(4, simd_inst_prefex, simd_set) \ -CONVOLVEX(rows)(simd_inst_prefex) \ -\ -STORE_SUMX(rows)(simd_inst_prefex) \ -\ -INC_OUTPUTX(rows)(sizeof(__ ## simd_type) / sizeof(float)) - - -#define CONVOLVE_8COLS_XROWS(rows, i) \ -{ \ -CONVOLUTION_LOOP(rows, m256, m256, _set1_ps, i) \ -} - -#define CONVOLVE_4COLS_XROWS(rows, i) \ -{ \ -CONVOLUTION_LOOP(rows, m128, m, _set_ps1, i) \ -} - -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/generic/simd/convolve.c b/contrib/lua-torch/torch7/lib/TH/generic/simd/convolve.c deleted file mode 100644 index da7a4bb20..000000000 --- a/contrib/lua-torch/torch7/lib/TH/generic/simd/convolve.c +++ /dev/null @@ -1,127 +0,0 @@ -#if defined(USE_AVX) && defined(__AVX__) - -#ifdef _MSC_VER -#include <intrin.h> - -static 
__inline int __get_cpuid (unsigned int __level, unsigned int *__eax, - unsigned int *__ebx, unsigned int *__ecx, - unsigned int *__edx) { - unsigned int cpui[4]; - __cpuid(cpui, __level); - *__eax = cpui[0]; *__ebx = cpui[1]; *__ecx = cpui[2]; *__edx = cpui[3]; - return 1; -} - -static void xgetbv(unsigned int op, unsigned int* eax, unsigned int* edx) { - *eax = 0; *edx = 0; - if (op == 0) - *eax = _xgetbv(_XCR_XFEATURE_ENABLED_MASK); -} - -#else - -#if __i386__ -#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \ -__asm(" pushl %%ebx\n" \ -" cpuid\n" \ -" mov %%ebx,%1\n" \ -" popl %%ebx" \ -: "=a"(__eax), "=r" (__ebx), "=c"(__ecx), "=d"(__edx) \ -: "0"(__level)) -#else -#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \ -__asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \ -: "0"(__level)) -#endif - -static __inline int __get_cpuid (unsigned int __level, unsigned int *__eax, - unsigned int *__ebx, unsigned int *__ecx, - unsigned int *__edx) { - __cpuid(__level, *__eax, *__ebx, *__ecx, *__edx); - return 1; -} - -static void xgetbv(unsigned int op, unsigned int* eax, unsigned int* edx) { - __asm__ __volatile__ - (".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); -} - -#endif - -enum ECPUFeature -{ - kCPUFeature_SSE = 0x01, - kCPUFeature_SSE2 = 0x02, - kCPUFeature_SSE3 = 0x04, - kCPUFeature_SSE3_S = 0x08, - kCPUFeature_SSE4_1 = 0x10, - kCPUFeature_SSE4_2 = 0x20, - kCPUFeature_AVX = 0x40 -}; - -static unsigned int checkCPUFeatures() { - unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0; - unsigned int features = 0; - __get_cpuid(1, &eax, &ebx, &ecx, &edx); - if( (edx & (1 << 25)) != 0 ) { - features |= kCPUFeature_SSE; - } - if( (edx & (1 << 26)) != 0 ) { - features |= kCPUFeature_SSE2; - } - if( (ecx & (1 << 0)) != 0 ) { - features |= kCPUFeature_SSE3; - } - if( (ecx & (1 << 9)) != 0 ) { - features |= kCPUFeature_SSE3_S; - } - if( (ecx & (1 << 19)) != 0 ) { - features |= kCPUFeature_SSE4_1; - } - if( (ecx & (1 << 20)) != 0 ) 
{ - features |= kCPUFeature_SSE4_2; - } - if( (ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0 ) { - xgetbv(0, &eax, &edx); - if( (eax & 6) == 6 ) { - features |= kCPUFeature_AVX; - } - } - return features; -} - -#include <stdio.h> - -static int haveCPUFeature(unsigned int feature) { - static unsigned int sCPUFeatures = 0; - static int sDetectedCPUFeatures = 0; - if (!sDetectedCPUFeatures) { - sDetectedCPUFeatures = 1; - sCPUFeatures = checkCPUFeatures(); - if ((sCPUFeatures & kCPUFeature_AVX) != 0) { - printf("torch running avx\n"); - } else { - printf("torch running sse \n"); - } - } - return (sCPUFeatures & feature) != 0; -} - -#endif - -void convolve_5x5_sse(float* output, float* input, float* kernel, long outRows, long outCols, long outStride, long inCols); -void convolve_5x5_avx(float* output, float* input, float* kernel, long outRows, long outCols, long outStride, long inCols); - -void convolve_5x5(float* output, float* input, float* kernel, long outRows, long outCols, long inCols) { -#if defined(USE_AVX) && defined(__AVX__) - int avx = haveCPUFeature(kCPUFeature_AVX); - if (avx) - { - convolve_5x5_avx(output, input, kernel, outRows, outCols, outCols, inCols); - } - else -#endif - { - convolve_5x5_sse(output, input, kernel, outRows, outCols, outCols, inCols); - } -} diff --git a/contrib/lua-torch/torch7/lib/TH/generic/simd/convolve.h b/contrib/lua-torch/torch7/lib/TH/generic/simd/convolve.h deleted file mode 100644 index 7b9b04c50..000000000 --- a/contrib/lua-torch/torch7/lib/TH/generic/simd/convolve.h +++ /dev/null @@ -1 +0,0 @@ -void convolve_5x5(float* output, float* input, float* kernel, long outRows, long outCols, long inCols);
\ No newline at end of file diff --git a/contrib/lua-torch/torch7/lib/TH/generic/simd/convolve5x5_avx.c b/contrib/lua-torch/torch7/lib/TH/generic/simd/convolve5x5_avx.c deleted file mode 100644 index 52b6d0ffb..000000000 --- a/contrib/lua-torch/torch7/lib/TH/generic/simd/convolve5x5_avx.c +++ /dev/null @@ -1,212 +0,0 @@ -#include <immintrin.h> -#include "common_simd.h" - -#define CLEAR_AVX() _mm256_zeroupper() - -void convolve_5x5_1_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) { - long i = 0; - long alignedCount = count & 0xFFFFFFF8; - DECLARE_OUTPUT_1() - for (; i < alignedCount; i+=8) { - CONVOLVE_8COLS_XROWS(1, i) - } -} - -void convolve_5x5_2_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) { - long i = 0; - long alignedCount = count & 0xFFFFFFF8; - DECLARE_OUTPUT_2() - for (; i < alignedCount; i+=8) { - CONVOLVE_8COLS_XROWS(2, i) - } -} - -void convolve_5x5_4_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) { - long i = 0; - long alignedCount = count & 0xFFFFFFF8; - DECLARE_OUTPUT_4() - for (; i < alignedCount; i+=8) { - CONVOLVE_8COLS_XROWS(4, i) - } -} - -void convolve_5x5_5_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) { - long i = 0; - long alignedCount = count & 0xFFFFFFF8; - DECLARE_OUTPUT_5() - for (; i < alignedCount; i+=8) { - CONVOLVE_8COLS_XROWS(5, i) - } -} - -void convolve_5x5_6_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) { - long i = 0; - long alignedCount = count & 0xFFFFFFF8; - DECLARE_OUTPUT_6() - for (; i < alignedCount; i+=8) { - CONVOLVE_8COLS_XROWS(6, i) - } -} - -void convolve_5x5_7_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) { - long i = 0; - long alignedCount = count & 0xFFFFFFF8; - DECLARE_OUTPUT_7() - for (; i < alignedCount; i+=8) { - 
CONVOLVE_8COLS_XROWS(7, i) - } -} - -void convolve_5x5_8_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) { - long i = 0; - long alignedCount = count & 0xFFFFFFF8; - DECLARE_OUTPUT_8() - for (; i < alignedCount; i+=8) { - CONVOLVE_8COLS_XROWS(8, i) - } -} - -void convolve_5x5_64x64_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) { - for(int i = 0; i < 60; i+=6) - { - DECLARE_OUTPUT_6() - CONVOLVE_8COLS_XROWS(6, 0) - CONVOLVE_8COLS_XROWS(6, 8) - CONVOLVE_8COLS_XROWS(6, 16) - CONVOLVE_8COLS_XROWS(6, 24) - CONVOLVE_8COLS_XROWS(6, 32) - CONVOLVE_8COLS_XROWS(6, 40) - CONVOLVE_8COLS_XROWS(6, 48) - CONVOLVE_8COLS_XROWS(6, 56) - output += outputStride * 6; - image += inputStride * 6; - } - DECLARE_OUTPUT_4() - CONVOLVE_8COLS_XROWS(4, 0) - CONVOLVE_8COLS_XROWS(4, 8) - CONVOLVE_8COLS_XROWS(4, 16) - CONVOLVE_8COLS_XROWS(4, 24) - CONVOLVE_8COLS_XROWS(4, 32) - CONVOLVE_8COLS_XROWS(4, 40) - CONVOLVE_8COLS_XROWS(4, 48) - CONVOLVE_8COLS_XROWS(4, 56) -} - -void convolve_5x5_32x32_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) { - for(int i = 0; i < 30; i+=6) - { - DECLARE_OUTPUT_6() - CONVOLVE_8COLS_XROWS(6, 0) - CONVOLVE_8COLS_XROWS(6, 8) - CONVOLVE_8COLS_XROWS(6, 16) - CONVOLVE_8COLS_XROWS(6, 24) - output += outputStride * 6; - image += inputStride * 6; - } - DECLARE_OUTPUT_2() - CONVOLVE_8COLS_XROWS(2, 0) - CONVOLVE_8COLS_XROWS(2, 8) - CONVOLVE_8COLS_XROWS(2, 16) - CONVOLVE_8COLS_XROWS(2, 24) -} - -void convolve_5x5_16x16_avx(float* output, float* image, float* weight, long count, long outputStride, long inputStride) { - for(int i = 0; i < 12; i+=6) - { - DECLARE_OUTPUT_6() - CONVOLVE_8COLS_XROWS(6, 0) - CONVOLVE_8COLS_XROWS(6, 8) - output += outputStride * 6; - image += inputStride * 6; - } - DECLARE_OUTPUT_4() - CONVOLVE_8COLS_XROWS(4, 0) - CONVOLVE_8COLS_XROWS(4, 8) -} - -void convolve_5x5_8x8_avx(float* output, float* image, 
float* weight, long count, long outputStride, long inputStride) { - DECLARE_OUTPUT_8() - CONVOLVE_8COLS_XROWS(8, 0) -} - -void convolve_5x5_sse(float* output, float* input, float* kernel, long outRows, long outCols, long outStride, long inCols); - -void convolve_5x5_avx(float* output, float* input, float* kernel, long outRows, long outCols, long outStride, long inCols) { - long ic = inCols; - long yy = 0; - float* t_ = input; - float* r_ = output; - float* k_ = kernel; - - if((outRows == 64) && (outCols == 64)) { - convolve_5x5_64x64_avx(output, input, kernel, outRows, outStride, inCols); - return; - } - - if((outRows == 32) && (outCols == 32)) { - convolve_5x5_32x32_avx(output, input, kernel, outRows, outStride, inCols); - return; - } - - if((outRows == 16) && (outCols == 16)) { - convolve_5x5_16x16_avx(output, input, kernel, outRows, outStride, inCols); - return; - } - - if((outRows == 8) && (outCols == 8)) { - convolve_5x5_8x8_avx(output, input, kernel, outRows, outStride, inCols); - return; - } - - for(; yy < (outRows / 6 ) * 6; yy += 6) { - float *pi_ = t_ + yy*ic; - float *pw_ = k_; - float *pis_ = pi_; - convolve_5x5_6_avx(r_, pis_, pw_, outCols, outStride, ic); - r_ += (outStride * 6); - } - - // more than 2 rows left to process and we ended up on a non-multiple of 4 - if((yy < (outRows & 0xFFFFFFFE)) && ((yy % 4) != 0)) { - // process 2 rows to align on the next multiple of 4 rows (because we were a multiple of 6 after the previous loop) - float *pi_ = t_ + yy*ic; - float *pw_ = k_; - float *pis_ = pi_; - convolve_5x5_2_avx(r_, pis_, pw_, outCols, outStride, ic); - r_ += (outStride * 2); - yy += 2; - } - - for(; yy < (outRows & 0xFFFFFFFC); yy += 4) { - float *pi_ = t_ + yy*ic; - float *pw_ = k_; - float *pis_ = pi_; - convolve_5x5_4_avx(r_, pis_, pw_, outCols, outStride, ic); - r_ += (outStride * 4); - } - - for(; yy < (outRows & 0xFFFFFFFE); yy += 2) { - float *pi_ = t_ + yy*ic; - float *pw_ = k_; - float *pis_ = pi_; - convolve_5x5_2_avx(r_, pis_, pw_, 
outCols, outStride, ic); - r_ += (outStride * 2); - } - - for(; yy < outRows; yy += 1) { - float *pi_ = t_ + yy*ic; - float *pw_ = k_; - float *pis_ = pi_; - convolve_5x5_1_avx(r_, pis_, pw_, outCols, outStride, ic); - r_ += (outStride * 1); - } - - long procCols = outCols & 0xFFFFFFF8; // avx version processes 8 cols at a time - long remCols = outCols - procCols; - - //process the rest using sse - if( remCols > 0) { - CLEAR_AVX(); - convolve_5x5_sse(&output[procCols], &input[procCols], kernel, outRows, remCols, outStride, inCols); - } -}
\ No newline at end of file diff --git a/contrib/lua-torch/torch7/lib/TH/generic/simd/convolve5x5_sse.c b/contrib/lua-torch/torch7/lib/TH/generic/simd/convolve5x5_sse.c deleted file mode 100644 index f34b79695..000000000 --- a/contrib/lua-torch/torch7/lib/TH/generic/simd/convolve5x5_sse.c +++ /dev/null @@ -1,320 +0,0 @@ -#include <emmintrin.h> -#include "common_simd.h" - - -/* SSE variants */ -void convolve_5x5_1_sse(float* output, float* image, float* weight, long count, long outputStride, long inputStride) { - long i = 0; - long alignedCount4 = count & 0xFFFFFFFC; - DECLARE_OUTPUT_1() - for (; i < alignedCount4; i+=4) { - CONVOLVE_4COLS_XROWS(1, i) - } - for (; i < (count); i++) { - float output0 = output[i + outputStride * 0]; - int row; - for (row = 0; row < 5; row++) { - int col; - for (col = 0; col < 5; col++) { - output0 += weight[5 * row + col] * image[i + (row + 0) * inputStride + col]; - } - } - output[i + outputStride * 0] = output0; - } -} - -void convolve_5x5_2_sse(float* output, float* image, float* weight, long count, long outputStride, long inputStride) { - long i = 0; - long alignedCount4 = count & 0xFFFFFFFC; - DECLARE_OUTPUT_2() - for (; i < alignedCount4; i+=4) { - CONVOLVE_4COLS_XROWS(2, i) - } - for (; i < (count); i++) { - float output0 = output[i + outputStride * 0]; - float output1 = output[i + outputStride * 1]; - int row; - for (row = 0; row < 5; row++) { - int col; - for (col = 0; col < 5; col++) { - output0 += weight[5 * row + col] * image[i + (row + 0) * inputStride + col]; - output1 += weight[5 * row + col] * image[i + (row + 1) * inputStride + col]; - } - } - output[i + outputStride * 0] = output0; - output[i + outputStride * 1] = output1; - } -} - -void convolve_5x5_4_sse(float* output, float* image, float* weight, long count, long outputStride, long inputStride) { - long i = 0; - long alignedCount4 = count & 0xFFFFFFFC; - DECLARE_OUTPUT_4() - for (; i < alignedCount4; i+=4) { - CONVOLVE_4COLS_XROWS(4, i) - } - for (; i < (count); 
i++) { - float output0 = output[i + outputStride * 0]; - float output1 = output[i + outputStride * 1]; - float output2 = output[i + outputStride * 2]; - float output3 = output[i + outputStride * 3]; - int row; - for (row = 0; row < 5; row++) { - int col; - for (col = 0; col < 5; col++) { - output0 += weight[5 * row + col] * image[i + (row + 0) * inputStride + col]; - output1 += weight[5 * row + col] * image[i + (row + 1) * inputStride + col]; - output2 += weight[5 * row + col] * image[i + (row + 2) * inputStride + col]; - output3 += weight[5 * row + col] * image[i + (row + 3) * inputStride + col]; - } - } - output[i + outputStride * 0] = output0; - output[i + outputStride * 1] = output1; - output[i + outputStride * 2] = output2; - output[i + outputStride * 3] = output3; - } -} - -void convolve_5x5_6_sse(float* output, float* image, float* weight, long count, long outputStride, long inputStride) { - long i = 0; - long alignedCount4 = count & 0xFFFFFFFC; - DECLARE_OUTPUT_6() - for (; i < alignedCount4; i+=4) { - CONVOLVE_4COLS_XROWS(6, i) - } - for (; i<(count); i++) { - float output0 = output[i + outputStride * 0]; - float output1 = output[i + outputStride * 1]; - float output2 = output[i + outputStride * 2]; - float output3 = output[i + outputStride * 3]; - float output4 = output[i + outputStride * 4]; - float output5 = output[i + outputStride * 5]; - int row; - for (row = 0; row < 5; row++) { - int col; - for (col = 0; col < 5; col++) { - output0 += weight[5 * row + col] * image[i + (row + 0) * inputStride + col]; - output1 += weight[5 * row + col] * image[i + (row + 1) * inputStride + col]; - output2 += weight[5 * row + col] * image[i + (row + 2) * inputStride + col]; - output3 += weight[5 * row + col] * image[i + (row + 3) * inputStride + col]; - output4 += weight[5 * row + col] * image[i + (row + 4) * inputStride + col]; - output5 += weight[5 * row + col] * image[i + (row + 5) * inputStride + col]; - } - } - output[i + outputStride * 0] = output0; - output[i + 
outputStride * 1] = output1; - output[i + outputStride * 2] = output2; - output[i + outputStride * 3] = output3; - output[i + outputStride * 4] = output4; - output[i + outputStride * 5] = output5; - } -} - -void convolve_5x5_8_sse(float* output, float* image, float* weight, long count, long outputStride, long inputStride) { - long i = 0; - long alignedCount4 = count & 0xFFFFFFFC; - DECLARE_OUTPUT_8() - for (; i < alignedCount4; i+=4) { - CONVOLVE_4COLS_XROWS(8, i) - } - for (; i<(count); i++) { - float output0 = output[i + outputStride * 0]; - float output1 = output[i + outputStride * 1]; - float output2 = output[i + outputStride * 2]; - float output3 = output[i + outputStride * 3]; - float output4 = output[i + outputStride * 4]; - float output5 = output[i + outputStride * 5]; - float output6 = output[i + outputStride * 6]; - float output7 = output[i + outputStride * 7]; - int row; - for (row = 0; row < 5; row++) { - int col; - for (col = 0; col < 5; col++) { - output0 += weight[5 * row + col] * image[i + (row + 0) * inputStride + col]; - output1 += weight[5 * row + col] * image[i + (row + 1) * inputStride + col]; - output2 += weight[5 * row + col] * image[i + (row + 2) * inputStride + col]; - output3 += weight[5 * row + col] * image[i + (row + 3) * inputStride + col]; - output4 += weight[5 * row + col] * image[i + (row + 4) * inputStride + col]; - output5 += weight[5 * row + col] * image[i + (row + 5) * inputStride + col]; - output6 += weight[5 * row + col] * image[i + (row + 6) * inputStride + col]; - output7 += weight[5 * row + col] * image[i + (row + 7) * inputStride + col]; - } - } - output[i + outputStride * 0] = output0; - output[i + outputStride * 1] = output1; - output[i + outputStride * 2] = output2; - output[i + outputStride * 3] = output3; - output[i + outputStride * 4] = output4; - output[i + outputStride * 5] = output5; - output[i + outputStride * 6] = output6; - output[i + outputStride * 7] = output7; - } -} - -#define UNROLL_SSE_CONVOLUTION 0 -#if 
(UNROLL_SSE_CONVOLUTION) - -void convolve_5x5_64x64_sse(float* output, float* image, float* weight, long count, long outputStride, long inputStride) { - for(int i = 0; i < 60; i+=6) - { - DECLARE_OUTPUT_6() - CONVOLVE_4COLS_XROWS(6, 0) - CONVOLVE_4COLS_XROWS(6, 4) - CONVOLVE_4COLS_XROWS(6, 8) - CONVOLVE_4COLS_XROWS(6, 12) - CONVOLVE_4COLS_XROWS(6, 16) - CONVOLVE_4COLS_XROWS(6, 20) - CONVOLVE_4COLS_XROWS(6, 24) - CONVOLVE_4COLS_XROWS(6, 28) - CONVOLVE_4COLS_XROWS(6, 32) - CONVOLVE_4COLS_XROWS(6, 36) - CONVOLVE_4COLS_XROWS(6, 40) - CONVOLVE_4COLS_XROWS(6, 44) - CONVOLVE_4COLS_XROWS(6, 48) - CONVOLVE_4COLS_XROWS(6, 52) - CONVOLVE_4COLS_XROWS(6, 56) - CONVOLVE_4COLS_XROWS(6, 60) - output += outputStride * 6; - image += inputStride * 6; - } - DECLARE_OUTPUT_4() - CONVOLVE_4COLS_XROWS(4, 0) - CONVOLVE_4COLS_XROWS(4, 4) - CONVOLVE_4COLS_XROWS(4, 8) - CONVOLVE_4COLS_XROWS(4, 12) - CONVOLVE_4COLS_XROWS(4, 16) - CONVOLVE_4COLS_XROWS(4, 20) - CONVOLVE_4COLS_XROWS(4, 24) - CONVOLVE_4COLS_XROWS(4, 28) - CONVOLVE_4COLS_XROWS(4, 32) - CONVOLVE_4COLS_XROWS(4, 36) - CONVOLVE_4COLS_XROWS(4, 40) - CONVOLVE_4COLS_XROWS(4, 44) - CONVOLVE_4COLS_XROWS(4, 48) - CONVOLVE_4COLS_XROWS(4, 52) - CONVOLVE_4COLS_XROWS(4, 56) - CONVOLVE_4COLS_XROWS(4, 60) -} - -void convolve_5x5_32x32_sse(float* output, float* image, float* weight, long count, long outputStride, long inputStride) { - for(int i = 0; i < 30; i+=6) - { - DECLARE_OUTPUT_6() - - CONVOLVE_4COLS_XROWS(6, 0) - CONVOLVE_4COLS_XROWS(6, 4) - CONVOLVE_4COLS_XROWS(6, 8) - CONVOLVE_4COLS_XROWS(6, 12) - CONVOLVE_4COLS_XROWS(6, 16) - CONVOLVE_4COLS_XROWS(6, 20) - CONVOLVE_4COLS_XROWS(6, 24) - CONVOLVE_4COLS_XROWS(6, 28) - - output += outputStride * 6; - image += inputStride * 6; - } - DECLARE_OUTPUT_2() - CONVOLVE_4COLS_XROWS(2, 0) - CONVOLVE_4COLS_XROWS(2, 4) - CONVOLVE_4COLS_XROWS(2, 8) - CONVOLVE_4COLS_XROWS(2, 12) - CONVOLVE_4COLS_XROWS(2, 16) - CONVOLVE_4COLS_XROWS(2, 20) - CONVOLVE_4COLS_XROWS(2, 24) - CONVOLVE_4COLS_XROWS(2, 28) -} - 
-void convolve_5x5_16x16_sse(float* output, float* image, float* weight, long count, long outputStride, long inputStride) { - for(int i = 0; i < 12; i+=6) - { - DECLARE_OUTPUT_6() - CONVOLVE_4COLS_XROWS(6, 0) - CONVOLVE_4COLS_XROWS(6, 4) - CONVOLVE_4COLS_XROWS(6, 8) - CONVOLVE_4COLS_XROWS(6, 12) - output += outputStride * 6; - image += inputStride * 6; - } - DECLARE_OUTPUT_4() - CONVOLVE_4COLS_XROWS(4, 0) - CONVOLVE_4COLS_XROWS(4, 4) - CONVOLVE_4COLS_XROWS(4, 8) - CONVOLVE_4COLS_XROWS(4, 12) -} - -void convolve_5x5_8x8_sse(float* output, float* image, float* weight, long count, long outputStride, long inputStride) { - DECLARE_OUTPUT_8() - CONVOLVE_4COLS_XROWS(8, 0) - CONVOLVE_4COLS_XROWS(8, 4) -} - -#endif - -void convolve_5x5_sse(float* output, float* input, float* kernel, long outRows, long outCols, long outStride, long inCols) { - long yy = 0; - float* t_ = input; - float* r_ = output; - float* k_ = kernel; -#if (UNROLL_SSE_CONVOLUTION) - if((outRows == 64) && (outCols == 64)) { - convolve_5x5_64x64_sse(output, input, kernel, outRows, outStride, inCols); - return; - } - - if((outRows == 32) && (outCols == 32)) { - convolve_5x5_32x32_sse(output, input, kernel, outRows, outStride, inCols); - return; - } - - if((outRows == 16) && (outCols == 16)) { - convolve_5x5_16x16_sse(output, input, kernel, outRows, outStride, inCols); - return; - } - - if((outRows == 8) && (outCols == 8)) { - convolve_5x5_8x8_sse(output, input, kernel, outRows, outStride, inCols); - return; - } -#endif - for(; yy < (outRows / 6 ) * 6; yy += 6) { - float *pi_ = t_ + yy*inCols; - float *pw_ = k_; - float *pis_ = pi_; - convolve_5x5_6_sse(r_, pis_, pw_, outCols, outStride, inCols); - r_ += (outStride * 6); - } - // more than 2 rows left to process and we ended up on a non-multiple of 4 - if((yy < (outRows & 0xFFFFFFFE)) && ((yy % 4) != 0)) { - // process 2 rows to align on the next multiple of 4 rows (because we were a multiple of 6 after the previous loop) - float *pi_ = t_ + yy*inCols; - float 
*pw_ = k_; - float *pis_ = pi_; - convolve_5x5_2_sse(r_, pis_, pw_, outCols, outStride, inCols); - r_ += (outStride * 2); - yy += 2; - } - - for(; yy < (outRows & 0xFFFFFFFC); yy += 4) { - float *pi_ = t_ + yy*inCols; - float *pw_ = k_; - float *pis_ = pi_; - convolve_5x5_4_sse(r_, pis_, pw_, outCols, outStride, inCols); - r_ += (outStride * 4); - } - - for(; yy < (outRows & 0xFFFFFFFE); yy += 2) { - float *pi_ = t_ + yy*inCols; - float *pw_ = k_; - float *pis_ = pi_; - convolve_5x5_2_sse(r_, pis_, pw_, outCols, outStride, inCols); - r_ += (outStride * 2); - } - - for(; yy < outRows; yy += 1) { - float *pi_ = t_ + yy*inCols; - float *pw_ = k_; - float *pis_ = pi_; - convolve_5x5_1_sse(r_, pis_, pw_, outCols, outStride, inCols); - r_ += (outStride * 1); - } -} diff --git a/contrib/lua-torch/torch7/lib/TH/generic/simd/simd.h b/contrib/lua-torch/torch7/lib/TH/generic/simd/simd.h deleted file mode 100644 index b1878ad5b..000000000 --- a/contrib/lua-torch/torch7/lib/TH/generic/simd/simd.h +++ /dev/null @@ -1,165 +0,0 @@ -#ifndef TH_SIMD_INC -#define TH_SIMD_INC - -#include <stdint.h> -#include <stdlib.h> -#if defined(_MSC_VER) -#include <intrin.h> -#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID) -#include <cpuid.h> -#endif - -// Can be found on Intel ISA Reference for CPUID -#define CPUID_AVX2_BIT 0x20 // Bit 5 of EBX for EAX=0x7 -#define CPUID_AVX_BIT 0x10000000 // Bit 28 of ECX for EAX=0x1 -#define CPUID_SSE_BIT 0x2000000 // bit 25 of EDX for EAX=0x1 - -// Helper macros for initialization -#define FUNCTION_IMPL(NAME, EXT) \ - { .function=(void *)NAME, \ - .supportedSimdExt=EXT \ - } - -#define INIT_DISPATCH_PTR(OP) \ - do { \ - int i; \ - for (i = 0; i < sizeof(THVector_(OP ## _DISPATCHTABLE)) / sizeof(FunctionDescription); ++i) { \ - THVector_(OP ## _DISPATCHPTR) = THVector_(OP ## _DISPATCHTABLE)[i].function; \ - if (THVector_(OP ## _DISPATCHTABLE)[i].supportedSimdExt & hostSimdExts) { \ - break; \ - } \ - } \ - } while(0) - - -typedef struct 
FunctionDescription -{ - void *function; - uint32_t supportedSimdExt; -} FunctionDescription; - - -enum SIMDExtensions -{ -#if defined(__NEON__) - SIMDExtension_NEON = 0x1, -#elif defined(__PPC64__) - SIMDExtension_VSX = 0x1, -#else - SIMDExtension_AVX2 = 0x1, - SIMDExtension_AVX = 0x2, - SIMDExtension_SSE = 0x4, -#endif - SIMDExtension_DEFAULT = 0x0 -}; - - -#if defined(__arm__) || defined(__aarch64__) // incl. armel, armhf, arm64 - - #if defined(__NEON__) - -static inline uint32_t detectHostSIMDExtensions() -{ - return SIMDExtension_NEON; -} - - #else //ARM without NEON - -static inline uint32_t detectHostSIMDExtensions() -{ - return SIMDExtension_DEFAULT; -} - - #endif - -#elif defined(__PPC64__) - - #if defined(__VSX__) - -static inline uint32_t detectHostSIMDExtensions() -{ - uint32_t hostSimdExts = SIMDExtension_DEFAULT; - char *evar; - - evar = getenv("TH_NO_VSX"); - if (evar == NULL || strncmp(evar, "1", 2) != 0) - hostSimdExts = SIMDExtension_VSX; - return hostSimdExts; -} - - #else //PPC64 without VSX - -static inline uint32_t detectHostSIMDExtensions() -{ - return SIMDExtension_DEFAULT; -} - - #endif - -#else // x86 -static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) -{ -#if defined(_MSC_VER) - uint32_t cpuInfo[4]; - __cpuid(cpuInfo, *eax); - *eax = cpuInfo[0]; - *ebx = cpuInfo[1]; - *ecx = cpuInfo[2]; - *edx = cpuInfo[3]; -#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID) - uint32_t level = *eax; - __get_cpuid (level, eax, ebx, ecx, edx); -#else - uint32_t a = *eax, b, c = *ecx, d; - __asm volatile ( "cpuid\n\t" - : "+a"(a), "=b"(b), "+c"(c), "=d"(d) ); - *eax = a; - *ebx = b; - *ecx = c; - *edx = d; -#endif -} - -static inline uint32_t detectHostSIMDExtensions() -{ - uint32_t eax, ebx, ecx, edx; - uint32_t hostSimdExts = 0x0; - int TH_NO_AVX = 1, TH_NO_AVX2 = 1, TH_NO_SSE = 1; - char *evar; - - evar = getenv("TH_NO_AVX2"); - if (evar == NULL || strncmp(evar, "1", 2) != 0) - TH_NO_AVX2 = 0; - - // Check 
for AVX2. Requires separate CPUID - eax = 0x7; - ecx = 0x0; - cpuid(&eax, &ebx, &ecx, &edx); - if ((ebx & CPUID_AVX2_BIT) && TH_NO_AVX2 == 0) { - hostSimdExts |= SIMDExtension_AVX2; - } - - // Detect and enable AVX and SSE - eax = 0x1; - cpuid(&eax, &ebx, &ecx, &edx); - - evar = getenv("TH_NO_AVX"); - if (evar == NULL || strncmp(evar, "1", 2) != 0) - TH_NO_AVX = 0; - if (ecx & CPUID_AVX_BIT && TH_NO_AVX == 0) { - hostSimdExts |= SIMDExtension_AVX; - } - - evar = getenv("TH_NO_SSE"); - if (evar == NULL || strncmp(evar, "1", 2) != 0) - TH_NO_SSE = 0; - if (edx & CPUID_SSE_BIT && TH_NO_SSE == 0) { - hostSimdExts |= SIMDExtension_SSE; - } - - return hostSimdExts; -} - -#endif // end SIMD extension detection code - -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/vector/AVX.c b/contrib/lua-torch/torch7/lib/TH/vector/AVX.c deleted file mode 100644 index 58c4e6d35..000000000 --- a/contrib/lua-torch/torch7/lib/TH/vector/AVX.c +++ /dev/null @@ -1,274 +0,0 @@ -#if defined(USE_AVX) && defined(__AVX__) -#ifndef _MSC_VER -#include <x86intrin.h> -#else -#include <intrin.h> -#endif - -#include "AVX.h" - -void THDoubleVector_copy_AVX(double *y, const double *x, const ptrdiff_t n) { - ptrdiff_t i; - ptrdiff_t off; - for (i=0; i<=((n)-8); i+=8) { - _mm256_storeu_pd(y+i, _mm256_loadu_pd(x+i)); - _mm256_storeu_pd(y+i+4, _mm256_loadu_pd(x+i+4)); - } - off = (n) - ((n)%8); - for (i=0; i<((n)%8); i++) { - y[off+i] = x[off+i]; - } -} - -void THDoubleVector_fill_AVX(double *x, const double c, const ptrdiff_t n) { - ptrdiff_t i; - ptrdiff_t off; - __m256d YMM0 = _mm256_set_pd(c, c, c, c); - for (i=0; i<=((n)-16); i+=16) { - _mm256_storeu_pd((x)+i , YMM0); - _mm256_storeu_pd((x)+i+4, YMM0); - _mm256_storeu_pd((x)+i+8, YMM0); - _mm256_storeu_pd((x)+i+12, YMM0); - } - off = (n) - ((n)%16); - for (i=0; i<((n)%16); i++) { - x[off+i] = c; - } -} - -void THDoubleVector_cdiv_AVX(double *z, const double *x, const double *y, const ptrdiff_t n) { - ptrdiff_t i; - __m256d YMM0, YMM1, YMM2, YMM3; - 
for (i=0; i<=((n)-8); i+=8) { - YMM0 = _mm256_loadu_pd(x+i); - YMM1 = _mm256_loadu_pd(x+i+4); - YMM2 = _mm256_loadu_pd(y+i); - YMM3 = _mm256_loadu_pd(y+i+4); - YMM2 = _mm256_div_pd(YMM0, YMM2); - YMM3 = _mm256_div_pd(YMM1, YMM3); - _mm256_storeu_pd(z+i, YMM2); - _mm256_storeu_pd(z+i+4, YMM3); - } - for (; i<(n); i++) { - z[i] = x[i] / y[i]; - } -} - -void THDoubleVector_divs_AVX(double *y, const double *x, const double c, const ptrdiff_t n) { - ptrdiff_t i; - __m256d YMM15 = _mm256_set_pd(c, c, c, c); - __m256d YMM0, YMM1; - for (i=0; i<=((n)-8); i+=8) { - YMM0 = _mm256_loadu_pd(x+i); - YMM1 = _mm256_loadu_pd(x+i+4); - YMM0 = _mm256_div_pd(YMM0, YMM15); - YMM1 = _mm256_div_pd(YMM1, YMM15); - _mm256_storeu_pd(y+i, YMM0); - _mm256_storeu_pd(y+i+4, YMM1); - } - for (; i<(n); i++) { - y[i] = x[i] / c; - } -} - -void THDoubleVector_cmul_AVX(double *z, const double *x, const double *y, const ptrdiff_t n) { - ptrdiff_t i; - __m256d YMM0, YMM1, YMM2, YMM3; - for (i=0; i<=((n)-8); i+=8) { - YMM0 = _mm256_loadu_pd(x+i); - YMM1 = _mm256_loadu_pd(x+i+4); - YMM2 = _mm256_loadu_pd(y+i); - YMM3 = _mm256_loadu_pd(y+i+4); - YMM2 = _mm256_mul_pd(YMM0, YMM2); - YMM3 = _mm256_mul_pd(YMM1, YMM3); - _mm256_storeu_pd(z+i, YMM2); - _mm256_storeu_pd(z+i+4, YMM3); - } - for (; i<n; i++) { - z[i] = x[i] * y[i]; - } -} - -void THDoubleVector_muls_AVX(double *y, const double *x, const double c, const ptrdiff_t n) { - ptrdiff_t i; - __m256d YMM15 = _mm256_set_pd(c, c, c, c); - __m256d YMM0, YMM1; - for (i=0; i<=((n)-8); i+=8) { - YMM0 = _mm256_loadu_pd(x+i); - YMM1 = _mm256_loadu_pd(x+i+4); - YMM0 = _mm256_mul_pd(YMM0, YMM15); - YMM1 = _mm256_mul_pd(YMM1, YMM15); - _mm256_storeu_pd(y+i, YMM0); - _mm256_storeu_pd(y+i+4, YMM1); - } - for (; i<n; i++) { - y[i] = x[i] * c; - } -} - -void THDoubleVector_cadd_AVX(double *z, const double *x, const double *y, const double c, const ptrdiff_t n) { - ptrdiff_t i; - __m256d YMM15 = _mm256_set_pd(c, c, c, c); - __m256d YMM0, YMM1, YMM2, YMM3; - for (i=0; 
i<=((n)-4); i+=4) { - YMM0 = _mm256_loadu_pd(y+i); - YMM1 = _mm256_loadu_pd(x+i); - YMM2 = _mm256_mul_pd(YMM0, YMM15); - YMM3 = _mm256_add_pd(YMM1, YMM2); - _mm256_storeu_pd(z+i, YMM3); - } - for (; i<(n); i++) { - z[i] = x[i] + y[i] * c; - } -} - -void THDoubleVector_adds_AVX(double *y, const double *x, const double c, const ptrdiff_t n) { - ptrdiff_t i; - __m256d YMM15 = _mm256_set_pd(c, c, c, c); - __m256d YMM0, YMM1; - for (i=0; i<=((n)-8); i+=8) { - YMM0 = _mm256_loadu_pd(x+i); - YMM1 = _mm256_loadu_pd(x+i+4); - YMM0 = _mm256_add_pd(YMM0, YMM15); - YMM1 = _mm256_add_pd(YMM1, YMM15); - _mm256_storeu_pd(y+i, YMM0); - _mm256_storeu_pd(y+i+4, YMM1); - } - for (; i<(n); i++) { - y[i] = x[i] + c; - } -} - -void THFloatVector_copy_AVX(float *y, const float *x, const ptrdiff_t n) { - ptrdiff_t i; - ptrdiff_t off; - for (i=0; i<=((n)-16); i+=16) { - _mm256_storeu_ps(y+i, _mm256_loadu_ps(x+i)); - _mm256_storeu_ps(y+i+8, _mm256_loadu_ps(x+i+8)); - } - off = (n) - ((n)%16); - for (i=0; i<((n)%16); i++) { - y[off+i] = x[off+i]; - } -} - -void THFloatVector_fill_AVX(float *x, const float c, const ptrdiff_t n) { - ptrdiff_t i; - ptrdiff_t off; - __m256 YMM0 = _mm256_set_ps(c, c, c, c, c, c, c, c); - for (i=0; i<=((n)-32); i+=32) { - _mm256_storeu_ps((x)+i , YMM0); - _mm256_storeu_ps((x)+i+8, YMM0); - _mm256_storeu_ps((x)+i+16, YMM0); - _mm256_storeu_ps((x)+i+24, YMM0); - } - off = (n) - ((n)%32); - for (i=0; i<((n)%32); i++) { - x[off+i] = c; - } -} - -void THFloatVector_cdiv_AVX(float *z, const float *x, const float *y, const ptrdiff_t n) { - ptrdiff_t i; - __m256 YMM0, YMM1, YMM2, YMM3; - for (i=0; i<=((n)-16); i+=16) { - YMM0 = _mm256_loadu_ps(x+i); - YMM1 = _mm256_loadu_ps(x+i+8); - YMM2 = _mm256_loadu_ps(y+i); - YMM3 = _mm256_loadu_ps(y+i+8); - YMM2 = _mm256_div_ps(YMM0, YMM2); - YMM3 = _mm256_div_ps(YMM1, YMM3); - _mm256_storeu_ps(z+i, YMM2); - _mm256_storeu_ps(z+i+8, YMM3); - } - for (; i<(n); i++) { - z[i] = x[i] / y[i]; - } -} - -void THFloatVector_divs_AVX(float 
*y, const float *x, const float c, const ptrdiff_t n) { - ptrdiff_t i; - __m256 YMM15 = _mm256_set_ps(c, c, c, c, c, c, c, c); - __m256 YMM0, YMM1; - for (i=0; i<=((n)-16); i+=16) { - YMM0 = _mm256_loadu_ps(x+i); - YMM1 = _mm256_loadu_ps(x+i+8); - YMM0 = _mm256_div_ps(YMM0, YMM15); - YMM1 = _mm256_div_ps(YMM1, YMM15); - _mm256_storeu_ps(y+i, YMM0); - _mm256_storeu_ps(y+i+8, YMM1); - } - for (; i<(n); i++) { - y[i] = x[i] / c; - } -} - -void THFloatVector_cmul_AVX(float *z, const float *x, const float *y, const ptrdiff_t n) { - ptrdiff_t i; - __m256 YMM0, YMM1, YMM2, YMM3; - for (i=0; i<=((n)-16); i+=16) { - YMM0 = _mm256_loadu_ps(x+i); - YMM1 = _mm256_loadu_ps(x+i+8); - YMM2 = _mm256_loadu_ps(y+i); - YMM3 = _mm256_loadu_ps(y+i+8); - YMM2 = _mm256_mul_ps(YMM0, YMM2); - YMM3 = _mm256_mul_ps(YMM1, YMM3); - _mm256_storeu_ps(z+i, YMM2); - _mm256_storeu_ps(z+i+8, YMM3); - } - for (; i<n; i++) { - z[i] = x[i] * y[i]; - } -} - -void THFloatVector_muls_AVX(float *y, const float *x, const float c, const ptrdiff_t n) { - ptrdiff_t i; - __m256 YMM15 = _mm256_set_ps(c, c, c, c, c, c, c, c); - __m256 YMM0, YMM1; - for (i=0; i<=((n)-16); i+=16) { - YMM0 = _mm256_loadu_ps(x+i); - YMM1 = _mm256_loadu_ps(x+i+8); - YMM0 = _mm256_mul_ps(YMM0, YMM15); - YMM1 = _mm256_mul_ps(YMM1, YMM15); - _mm256_storeu_ps(y+i, YMM0); - _mm256_storeu_ps(y+i+8, YMM1); - } - for (; i<n; i++) { - y[i] = x[i] * c; - } -} - -void THFloatVector_cadd_AVX(float *z, const float *x, const float *y, const float c, const ptrdiff_t n) { - ptrdiff_t i; - __m256 YMM15 = _mm256_set_ps(c, c, c, c, c, c, c, c); - __m256 YMM0, YMM1, YMM2, YMM3; - for (i=0; i<=((n)-8); i+=8) { - YMM0 = _mm256_loadu_ps(y+i); - YMM1 = _mm256_loadu_ps(x+i); - YMM2 = _mm256_mul_ps(YMM0, YMM15); - YMM3 = _mm256_add_ps(YMM1, YMM2); - _mm256_storeu_ps(z+i, YMM3); - } - for (; i<(n); i++) { - z[i] = x[i] + y[i] * c; - } -} - -void THFloatVector_adds_AVX(float *y, const float *x, const float c, const ptrdiff_t n) { - ptrdiff_t i; - __m256 YMM15 = 
_mm256_set_ps(c, c, c, c, c, c, c, c); - __m256 YMM0, YMM1; - for (i=0; i<=((n)-16); i+=16) { - YMM0 = _mm256_loadu_ps(x+i); - YMM1 = _mm256_loadu_ps(x+i+8); - YMM0 = _mm256_add_ps(YMM0, YMM15); - YMM1 = _mm256_add_ps(YMM1, YMM15); - _mm256_storeu_ps(y+i, YMM0); - _mm256_storeu_ps(y+i+8, YMM1); - } - for (; i<(n); i++) { - y[i] = x[i] + c; - } -} - -#endif // defined(__AVX__) diff --git a/contrib/lua-torch/torch7/lib/TH/vector/AVX.h b/contrib/lua-torch/torch7/lib/TH/vector/AVX.h deleted file mode 100644 index bfaeaa6b0..000000000 --- a/contrib/lua-torch/torch7/lib/TH/vector/AVX.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef TH_AVX_H -#define TH_AVX_H - -#include <stddef.h> - -void THDoubleVector_copy_AVX(double *y, const double *x, const ptrdiff_t n); -void THDoubleVector_fill_AVX(double *x, const double c, const ptrdiff_t n); -void THDoubleVector_cdiv_AVX(double *z, const double *x, const double *y, const ptrdiff_t n); -void THDoubleVector_divs_AVX(double *y, const double *x, const double c, const ptrdiff_t n); -void THDoubleVector_cmul_AVX(double *z, const double *x, const double *y, const ptrdiff_t n); -void THDoubleVector_muls_AVX(double *y, const double *x, const double c, const ptrdiff_t n); -void THDoubleVector_cadd_AVX(double *z, const double *x, const double *y, const double c, const ptrdiff_t n); -void THDoubleVector_adds_AVX(double *y, const double *x, const double c, const ptrdiff_t n); -void THFloatVector_copy_AVX(float *y, const float *x, const ptrdiff_t n); -void THFloatVector_fill_AVX(float *x, const float c, const ptrdiff_t n); -void THFloatVector_cdiv_AVX(float *z, const float *x, const float *y, const ptrdiff_t n); -void THFloatVector_divs_AVX(float *y, const float *x, const float c, const ptrdiff_t n); -void THFloatVector_cmul_AVX(float *z, const float *x, const float *y, const ptrdiff_t n); -void THFloatVector_muls_AVX(float *y, const float *x, const float c, const ptrdiff_t n); -void THFloatVector_cadd_AVX(float *z, const float *x, const float *y, 
const float c, const ptrdiff_t n); -void THFloatVector_adds_AVX(float *y, const float *x, const float c, const ptrdiff_t n); - -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/vector/AVX2.c b/contrib/lua-torch/torch7/lib/TH/vector/AVX2.c deleted file mode 100644 index 082a680ea..000000000 --- a/contrib/lua-torch/torch7/lib/TH/vector/AVX2.c +++ /dev/null @@ -1,47 +0,0 @@ -#if defined(__AVX2__) -#ifndef _MSC_VER -#include <x86intrin.h> -#else -#include <intrin.h> -#endif -#include "AVX2.h" - -void THDoubleVector_cadd_AVX2(double *z, const double *x, const double *y, const double c, const ptrdiff_t n) { - ptrdiff_t i; - __m256d YMM15 = _mm256_set_pd(c, c, c, c); - __m256d YMM0, YMM1, YMM2, YMM3; - for (i=0; i<=((n)-8); i+=8) { - YMM0 = _mm256_loadu_pd(y+i); - YMM1 = _mm256_loadu_pd(y+i+4); - YMM2 = _mm256_loadu_pd(x+i); - YMM3 = _mm256_loadu_pd(x+i+4); - YMM2 = _mm256_fmadd_pd(YMM0, YMM15, YMM2); - YMM3 = _mm256_fmadd_pd(YMM1, YMM15, YMM3); - _mm256_storeu_pd(z+i, YMM2); - _mm256_storeu_pd(z+i+4, YMM3); - } - for (; i<(n); i++) { - z[i] = x[i] + y[i] * c; - } -} - -void THFloatVector_cadd_AVX2(float *z, const float *x, const float *y, const float c, const ptrdiff_t n) { - ptrdiff_t i; - __m256 YMM15 = _mm256_set_ps(c, c, c, c, c, c, c, c); - __m256 YMM0, YMM1, YMM2, YMM3; - for (i=0; i<=((n)-16); i+=16) { - YMM0 = _mm256_loadu_ps(y+i); - YMM1 = _mm256_loadu_ps(y+i+8); - YMM2 = _mm256_loadu_ps(x+i); - YMM3 = _mm256_loadu_ps(x+i+8); - YMM2 = _mm256_fmadd_ps(YMM0, YMM15, YMM2); - YMM3 = _mm256_fmadd_ps(YMM1, YMM15, YMM3); - _mm256_storeu_ps(z+i, YMM2); - _mm256_storeu_ps(z+i+8, YMM3); - } - for (; i<(n); i++) { - z[i] = x[i] + y[i] * c; - } -} - -#endif // defined(__AVX2__) diff --git a/contrib/lua-torch/torch7/lib/TH/vector/AVX2.h b/contrib/lua-torch/torch7/lib/TH/vector/AVX2.h deleted file mode 100644 index 85a9e93ee..000000000 --- a/contrib/lua-torch/torch7/lib/TH/vector/AVX2.h +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef TH_AVX2_H -#define TH_AVX2_H - -#include 
<stddef.h> - -void THDoubleVector_cadd_AVX2(double *z, const double *x, const double *y, const double c, const ptrdiff_t n); -void THFloatVector_cadd_AVX2(float *z, const float *x, const float *y, const float c, const ptrdiff_t n); - -#endif diff --git a/contrib/lua-torch/torch7/lib/TH/vector/NEON.c b/contrib/lua-torch/torch7/lib/TH/vector/NEON.c deleted file mode 100644 index 7920fb13b..000000000 --- a/contrib/lua-torch/torch7/lib/TH/vector/NEON.c +++ /dev/null @@ -1,105 +0,0 @@ -static void THFloatVector_fill_NEON(float *x, const float c, const ptrdiff_t n) { - long i = 0; - - for(; i < n-4; i += 4) - { - x[i] = c; - x[i+1] = c; - x[i+2] = c; - x[i+3] = c; - } - - for(; i < n; i++) - x[i] = c; - -} - -static void THFloatVector_cmul_NEON(float *z, const float *x, const float* y, const ptrdiff_t n) { - long i = 0; - - for(; i < n-4; i += 4) - { - z[i] = x[i] * y[i]; - z[i+1] = x[i+1] * y[i+1]; - z[i+2] = x[i+2] * y[i+2]; - z[i+3] = x[i+3] * y[i+3]; - } - - for(; i < n; i++) - z[i] = x[i] * y[i]; -} - -static void THFloatVector_muls_NEON(float *y, const float *x, const float c, const ptrdiff_t n) { - long i = 0; - - for(; i < n-4; i += 4) - { - y[i] = x[i] * c; - y[i+1] = x[i+1] * c; - y[i+2] = x[i+2] * c; - y[i+3] = x[i+3] * c; - } - - for(; i < n; i++) - y[i] = x[i] * c; -} - -static void THFloatVector_cadd_NEON(float *z, const float *x, const float *y, const float c, const ptrdiff_t n) { - long i = 0; - - for(;i < n-4; i += 4) - { - z[i] = x[i] + c * y[i]; - z[i+1] = x[i+1] + c * y[i+1]; - z[i+2] = x[i+2] + c * y[i+2]; - z[i+3] = x[i+3] + c * y[i+3]; - } - - for(; i < n; i++) - z[i] = x[i] + c * y[i]; -} - -static void THFloatVector_adds_NEON(float *y, const float *x, const float c, const ptrdiff_t n) { - long i = 0; - - for(;i < n-4; i += 4) - { - y[i] = x[i] + c; - y[i+1] = x[i+1] + c; - y[i+2] = x[i+2] + c; - y[i+3] = x[i+3] + c; - } - - for(; i < n; i++) - y[i] = x[i] + c; -} - -static void THFloatVector_cdiv_NEON(float *z, const float *x, const float *y, 
const ptrdiff_t n) { - long i = 0; - - for(;i < n-4; i += 4) - { - z[i] = x[i] / y[i]; - z[i+1] = x[i+1] / y[i+1]; - z[i+2] = x[i+2] / y[i+2]; - z[i+3] = x[i+3] / y[i+3]; - } - - for(; i < n; i++) - z[i] = x[i] / y[i]; -} - -static void THFloatVector_divs_NEON(float *y, const float *x, const float c, const ptrdiff_t n) { - long i = 0; - - for(;i < n-4; i += 4) - { - y[i] = x[i] / c; - y[i+1] = x[i+1] / c; - y[i+2] = x[i+2] / c; - y[i+3] = x[i+3] / c; - } - - for(; i < n; i++) - y[i] = x[i] / c; -} diff --git a/contrib/lua-torch/torch7/lib/TH/vector/SSE.c b/contrib/lua-torch/torch7/lib/TH/vector/SSE.c deleted file mode 100644 index d026935ab..000000000 --- a/contrib/lua-torch/torch7/lib/TH/vector/SSE.c +++ /dev/null @@ -1,268 +0,0 @@ -#ifndef _MSC_VER -#include <x86intrin.h> -#else -#include <intrin.h> -#endif - -static void THDoubleVector_fill_SSE(double *x, const double c, const ptrdiff_t n) { - ptrdiff_t i; - ptrdiff_t off; - __m128d XMM0 = _mm_set1_pd(c); - for (i=0; i<=((n)-8); i+=8) { - _mm_storeu_pd((x)+i , XMM0); - _mm_storeu_pd((x)+i+2, XMM0); - _mm_storeu_pd((x)+i+4, XMM0); - _mm_storeu_pd((x)+i+6, XMM0); - } - off = (n) - ((n)%8); - for (i=0; i<((n)%8); i++) { - x[off+i] = c; - } -} - -static void THDoubleVector_cadd_SSE(double *z, const double *x, const double *y, const double c, const ptrdiff_t n) { - ptrdiff_t i; - __m128d XMM7 = _mm_set1_pd(c); - __m128d XMM0, XMM2; - for (i=0; i<=((n)-2); i+=2) { - XMM0 = _mm_loadu_pd((x)+i); - XMM2 = _mm_loadu_pd((y)+i); - XMM2 = _mm_mul_pd(XMM2, XMM7); - XMM2 = _mm_add_pd(XMM0, XMM2); - _mm_storeu_pd((z)+i, XMM2); - } - for (; i<(n); i++) { - z[i] = x[i] + c * y[i]; - } -} - -static void THDoubleVector_adds_SSE(double *y, const double *x, const double c, const ptrdiff_t n) { - ptrdiff_t i; - __m128d XMM7 = _mm_set1_pd(c); - __m128d XMM0, XMM2; - for (i=0; i<=((n)-4); i+=4) { - XMM0 = _mm_loadu_pd((x)+i); - XMM2 = _mm_loadu_pd((x)+i+2); - XMM0 = _mm_add_pd(XMM0, XMM7); - XMM2 = _mm_add_pd(XMM2, XMM7); - 
_mm_storeu_pd((y)+i, XMM0); - _mm_storeu_pd((y)+i+2, XMM2); - } - for (; i<(n); i++) { - y[i] = x[i] + c; - } -} - -static void THDoubleVector_cmul_SSE(double *z, const double *x, const double *y, const ptrdiff_t n) { - ptrdiff_t i; - for (i=0; i<=((n)-8); i+=8) { - __m128d XMM0 = _mm_loadu_pd((x)+i ); - __m128d XMM1 = _mm_loadu_pd((x)+i+2); - __m128d XMM2 = _mm_loadu_pd((x)+i+4); - __m128d XMM3 = _mm_loadu_pd((x)+i+6); - __m128d XMM4 = _mm_loadu_pd((y)+i ); - __m128d XMM5 = _mm_loadu_pd((y)+i+2); - __m128d XMM6 = _mm_loadu_pd((y)+i+4); - __m128d XMM7 = _mm_loadu_pd((y)+i+6); - XMM4 = _mm_mul_pd(XMM4, XMM0); - XMM5 = _mm_mul_pd(XMM5, XMM1); - XMM6 = _mm_mul_pd(XMM6, XMM2); - XMM7 = _mm_mul_pd(XMM7, XMM3); - _mm_storeu_pd((z)+i , XMM4); - _mm_storeu_pd((z)+i+2, XMM5); - _mm_storeu_pd((z)+i+4, XMM6); - _mm_storeu_pd((z)+i+6, XMM7); - } - for (; i<(n); i++) { - z[i] = x[i] * y[i]; - } -} - -static void THDoubleVector_muls_SSE(double *y, const double *x, const double c, const ptrdiff_t n) { - ptrdiff_t i; - __m128d XMM15 = _mm_set1_pd(c); - for (i=0; i<=((n)-8); i+=8) { - __m128d XMM0 = _mm_loadu_pd((x)+i ); - __m128d XMM1 = _mm_loadu_pd((x)+i+2); - __m128d XMM2 = _mm_loadu_pd((x)+i+4); - __m128d XMM3 = _mm_loadu_pd((x)+i+6); - __m128d XMM4 = _mm_mul_pd(XMM15, XMM0); - __m128d XMM5 = _mm_mul_pd(XMM15, XMM1); - __m128d XMM6 = _mm_mul_pd(XMM15, XMM2); - __m128d XMM7 = _mm_mul_pd(XMM15, XMM3); - _mm_storeu_pd((y)+i , XMM4); - _mm_storeu_pd((y)+i+2, XMM5); - _mm_storeu_pd((y)+i+4, XMM6); - _mm_storeu_pd((y)+i+6, XMM7); - } - for (; i<(n); i++) { - y[i] = x[i] * c; - } -} - -static void THDoubleVector_cdiv_SSE(double *z, const double *x, const double *y, const ptrdiff_t n) { - ptrdiff_t i; - __m128d XMM0, XMM1, XMM2, XMM3; - for (i=0; i<=((n)-4); i+=4) { - XMM0 = _mm_loadu_pd(x+i); - XMM1 = _mm_loadu_pd(x+i+2); - XMM2 = _mm_loadu_pd(y+i); - XMM3 = _mm_loadu_pd(y+i+2); - XMM2 = _mm_div_pd(XMM0, XMM2); - XMM3 = _mm_div_pd(XMM1, XMM3); - _mm_storeu_pd(z+i, XMM2); - 
_mm_storeu_pd(z+i+2, XMM3); - } - for (; i<(n); i++) { - z[i] = x[i] / y[i]; - } -} - -static void THDoubleVector_divs_SSE(double *y, const double *x, const double c, const ptrdiff_t n) { - ptrdiff_t i; - __m128d XMM7 = _mm_set1_pd(c); - __m128d XMM0, XMM1; - for (i=0; i<=((n)-4); i+=4) { - XMM0 = _mm_loadu_pd(x+i); - XMM1 = _mm_loadu_pd(x+i+2); - XMM0 = _mm_div_pd(XMM0, XMM7); - XMM1 = _mm_div_pd(XMM1, XMM7); - _mm_storeu_pd(y+i, XMM0); - _mm_storeu_pd(y+i+2, XMM1); - } - for (; i<(n); i++) { - y[i] = x[i] / c; - } -} - -static void THFloatVector_fill_SSE(float *x, const float c, const ptrdiff_t n) { - ptrdiff_t i; - __m128 XMM0 = _mm_set_ps1(c); - ptrdiff_t off; - for (i=0; i<=((n)-16); i+=16) { - _mm_storeu_ps((x)+i , XMM0); - _mm_storeu_ps((x)+i+4, XMM0); - _mm_storeu_ps((x)+i+8, XMM0); - _mm_storeu_ps((x)+i+12, XMM0); - } - off = (n) - ((n)%16); - for (i=0; i<((n)%16); i++) { - x[off+i] = c; - } -} - - -static void THFloatVector_cadd_SSE(float *z, const float *x, const float *y, const float c, const ptrdiff_t n) { - ptrdiff_t i; - __m128 XMM7 = _mm_set_ps1(c); - __m128 XMM0, XMM2; - for (i=0; i<=((n)-4); i+=4) { - XMM0 = _mm_loadu_ps((x)+i); - XMM2 = _mm_loadu_ps((y)+i); - XMM2 = _mm_mul_ps(XMM2, XMM7); - XMM2 = _mm_add_ps(XMM0, XMM2); - _mm_storeu_ps((z)+i, XMM2); - } - for (; i<(n); i++) { - z[i] = x[i] + c * y[i]; - } -} - -static void THFloatVector_adds_SSE(float *y, const float *x, const float c, const ptrdiff_t n) { - ptrdiff_t i; - __m128 XMM7 = _mm_set1_ps(c); - __m128 XMM0, XMM2; - for (i=0; i<=((n)-8); i+=8) { - XMM0 = _mm_loadu_ps((x)+i); - XMM2 = _mm_loadu_ps((x)+i+4); - XMM0 = _mm_add_ps(XMM0, XMM7); - XMM2 = _mm_add_ps(XMM2, XMM7); - _mm_storeu_ps((y)+i, XMM0); - _mm_storeu_ps((y)+i+4, XMM2); - } - for (; i<(n); i++) { - y[i] = x[i] + c; - } -} - -static void THFloatVector_cmul_SSE(float *z, const float *x, const float *y, const ptrdiff_t n) { - ptrdiff_t i; - for (i=0; i<=((n)-16); i+=16) { - __m128 XMM0 = _mm_loadu_ps((x)+i ); - __m128 XMM1 = 
_mm_loadu_ps((x)+i+ 4); - __m128 XMM2 = _mm_loadu_ps((x)+i+ 8); - __m128 XMM3 = _mm_loadu_ps((x)+i+12); - __m128 XMM4 = _mm_loadu_ps((y)+i ); - __m128 XMM5 = _mm_loadu_ps((y)+i+ 4); - __m128 XMM6 = _mm_loadu_ps((y)+i+ 8); - __m128 XMM7 = _mm_loadu_ps((y)+i+12); - XMM4 = _mm_mul_ps(XMM4, XMM0); - XMM5 = _mm_mul_ps(XMM5, XMM1); - XMM6 = _mm_mul_ps(XMM6, XMM2); - XMM7 = _mm_mul_ps(XMM7, XMM3); - _mm_storeu_ps((z)+i , XMM4); - _mm_storeu_ps((z)+i+ 4, XMM5); - _mm_storeu_ps((z)+i+ 8, XMM6); - _mm_storeu_ps((z)+i+12, XMM7); - } - for (; i<(n); i++) { - z[i] = x[i] * y[i]; - } -} - -static void THFloatVector_muls_SSE(float *y, const float *x, const float c, const ptrdiff_t n) { - ptrdiff_t i; - __m128 XMM15 = _mm_set_ps1(c); - for (i=0; i<=((n)-16); i+=16) { - __m128 XMM0 = _mm_loadu_ps((x)+i ); - __m128 XMM1 = _mm_loadu_ps((x)+i+ 4); - __m128 XMM2 = _mm_loadu_ps((x)+i+ 8); - __m128 XMM3 = _mm_loadu_ps((x)+i+12); - __m128 XMM4 = _mm_mul_ps(XMM15, XMM0); - __m128 XMM5 = _mm_mul_ps(XMM15, XMM1); - __m128 XMM6 = _mm_mul_ps(XMM15, XMM2); - __m128 XMM7 = _mm_mul_ps(XMM15, XMM3); - _mm_storeu_ps((y)+i , XMM4); - _mm_storeu_ps((y)+i+ 4, XMM5); - _mm_storeu_ps((y)+i+ 8, XMM6); - _mm_storeu_ps((y)+i+12, XMM7); - } - for (; i<(n); i++) { - y[i] = x[i] * c; - } -} - -static void THFloatVector_cdiv_SSE(float *z, const float *x, const float *y, const ptrdiff_t n) { - ptrdiff_t i; - __m128 XMM0, XMM1, XMM2, XMM3; - for (i=0; i<=((n)-8); i+=8) { - XMM0 = _mm_loadu_ps(x+i); - XMM1 = _mm_loadu_ps(x+i+4); - XMM2 = _mm_loadu_ps(y+i); - XMM3 = _mm_loadu_ps(y+i+4); - XMM2 = _mm_div_ps(XMM0, XMM2); - XMM3 = _mm_div_ps(XMM1, XMM3); - _mm_storeu_ps(z+i, XMM2); - _mm_storeu_ps(z+i+4, XMM3); - } - for (; i<(n); i++) { - z[i] = x[i] / y[i]; - } -} - -static void THFloatVector_divs_SSE(float *y, const float *x, const float c, const ptrdiff_t n) { - ptrdiff_t i; - __m128 XMM7 = _mm_set1_ps(c); - __m128 XMM0, XMM1; - for (i=0; i<=((n)-8); i+=8) { - XMM0 = _mm_loadu_ps(x+i); - XMM1 = 
_mm_loadu_ps(x+i+4); - XMM0 = _mm_div_ps(XMM0, XMM7); - XMM1 = _mm_div_ps(XMM1, XMM7); - _mm_storeu_ps(y+i, XMM0); - _mm_storeu_ps(y+i+4, XMM1); - } - for (; i<(n); i++) { - y[i] = x[i] / c; - } -} diff --git a/contrib/lua-torch/torch7/lib/TH/vector/VSX.c b/contrib/lua-torch/torch7/lib/TH/vector/VSX.c deleted file mode 100644 index 9ff984ad7..000000000 --- a/contrib/lua-torch/torch7/lib/TH/vector/VSX.c +++ /dev/null @@ -1,2520 +0,0 @@ -#ifdef __PPC64__ -#include <altivec.h> -#include <stddef.h> - - -//-------------------------------------------------------------------------------------------------- -// THDoubleVector_fill_VSX: -//-------------------------------------------------------------------------------------------------- -static void THDoubleVector_fill_VSX(double *x, const double c, const ptrdiff_t n) -{ - ptrdiff_t i; - - double val[2] = {c, c}; - vector double fp64vec2 = vec_xl(0, val); - - for (i = 0; i <= n-128; i += 128) - { - vec_xst(fp64vec2, 0, x+(i )); - vec_xst(fp64vec2, 0, x+(i+2 )); - vec_xst(fp64vec2, 0, x+(i+4 )); - vec_xst(fp64vec2, 0, x+(i+6 )); - vec_xst(fp64vec2, 0, x+(i+8 )); - vec_xst(fp64vec2, 0, x+(i+10 )); - vec_xst(fp64vec2, 0, x+(i+12 )); - vec_xst(fp64vec2, 0, x+(i+14 )); - vec_xst(fp64vec2, 0, x+(i+16 )); - vec_xst(fp64vec2, 0, x+(i+18 )); - vec_xst(fp64vec2, 0, x+(i+20 )); - vec_xst(fp64vec2, 0, x+(i+22 )); - vec_xst(fp64vec2, 0, x+(i+24 )); - vec_xst(fp64vec2, 0, x+(i+26 )); - vec_xst(fp64vec2, 0, x+(i+28 )); - vec_xst(fp64vec2, 0, x+(i+30 )); - vec_xst(fp64vec2, 0, x+(i+32 )); - vec_xst(fp64vec2, 0, x+(i+34 )); - vec_xst(fp64vec2, 0, x+(i+36 )); - vec_xst(fp64vec2, 0, x+(i+38 )); - vec_xst(fp64vec2, 0, x+(i+40 )); - vec_xst(fp64vec2, 0, x+(i+42 )); - vec_xst(fp64vec2, 0, x+(i+44 )); - vec_xst(fp64vec2, 0, x+(i+46 )); - vec_xst(fp64vec2, 0, x+(i+48 )); - vec_xst(fp64vec2, 0, x+(i+50 )); - vec_xst(fp64vec2, 0, x+(i+52 )); - vec_xst(fp64vec2, 0, x+(i+54 )); - vec_xst(fp64vec2, 0, x+(i+56 )); - vec_xst(fp64vec2, 0, x+(i+58 )); - 
vec_xst(fp64vec2, 0, x+(i+60 )); - vec_xst(fp64vec2, 0, x+(i+62 )); - vec_xst(fp64vec2, 0, x+(i+64 )); - vec_xst(fp64vec2, 0, x+(i+66 )); - vec_xst(fp64vec2, 0, x+(i+68 )); - vec_xst(fp64vec2, 0, x+(i+70 )); - vec_xst(fp64vec2, 0, x+(i+72 )); - vec_xst(fp64vec2, 0, x+(i+74 )); - vec_xst(fp64vec2, 0, x+(i+76 )); - vec_xst(fp64vec2, 0, x+(i+78 )); - vec_xst(fp64vec2, 0, x+(i+80 )); - vec_xst(fp64vec2, 0, x+(i+82 )); - vec_xst(fp64vec2, 0, x+(i+84 )); - vec_xst(fp64vec2, 0, x+(i+86 )); - vec_xst(fp64vec2, 0, x+(i+88 )); - vec_xst(fp64vec2, 0, x+(i+90 )); - vec_xst(fp64vec2, 0, x+(i+92 )); - vec_xst(fp64vec2, 0, x+(i+94 )); - vec_xst(fp64vec2, 0, x+(i+96 )); - vec_xst(fp64vec2, 0, x+(i+98 )); - vec_xst(fp64vec2, 0, x+(i+100)); - vec_xst(fp64vec2, 0, x+(i+102)); - vec_xst(fp64vec2, 0, x+(i+104)); - vec_xst(fp64vec2, 0, x+(i+106)); - vec_xst(fp64vec2, 0, x+(i+108)); - vec_xst(fp64vec2, 0, x+(i+110)); - vec_xst(fp64vec2, 0, x+(i+112)); - vec_xst(fp64vec2, 0, x+(i+114)); - vec_xst(fp64vec2, 0, x+(i+116)); - vec_xst(fp64vec2, 0, x+(i+118)); - vec_xst(fp64vec2, 0, x+(i+120)); - vec_xst(fp64vec2, 0, x+(i+122)); - vec_xst(fp64vec2, 0, x+(i+124)); - vec_xst(fp64vec2, 0, x+(i+126)); - } - for (; i <= n-16; i += 16) - { - vec_xst(fp64vec2, 0, x+(i )); - vec_xst(fp64vec2, 0, x+(i+2 )); - vec_xst(fp64vec2, 0, x+(i+4 )); - vec_xst(fp64vec2, 0, x+(i+6 )); - vec_xst(fp64vec2, 0, x+(i+8 )); - vec_xst(fp64vec2, 0, x+(i+10 )); - vec_xst(fp64vec2, 0, x+(i+12 )); - vec_xst(fp64vec2, 0, x+(i+14 )); - } - for (; i <= n-2; i += 2) - vec_xst(fp64vec2, 0, x+(i )); - for (; i < n; i++) - x[i] = c; -} - - -//-------------------------------------------------------------------------------------------------- -// THDoubleVector_cadds_VSX: -//-------------------------------------------------------------------------------------------------- -static void THDoubleVector_cadd_VSX(double *z, const double *x, const double *y, const double c, const ptrdiff_t n) -{ - ptrdiff_t i; - - double val[2] = {c, c}; - 
vector double c_fp64vec2 = vec_xl(0, val); - - vector double y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2; - vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2; - vector double x0_fp64vec2, x1_fp64vec2, x2_fp64vec2, x3_fp64vec2, x4_fp64vec2, x5_fp64vec2, x6_fp64vec2, x7_fp64vec2; - vector double x8_fp64vec2, x9_fp64vec2, x10_fp64vec2, x11_fp64vec2; - - - for (i = 0; i <= n-24; i += 24) - { - y0_fp64vec2 = vec_xl(0, y+(i )); - y1_fp64vec2 = vec_xl(0, y+(i+2 )); - y2_fp64vec2 = vec_xl(0, y+(i+4 )); - y3_fp64vec2 = vec_xl(0, y+(i+6 )); - y4_fp64vec2 = vec_xl(0, y+(i+8 )); - y5_fp64vec2 = vec_xl(0, y+(i+10)); - y6_fp64vec2 = vec_xl(0, y+(i+12)); - y7_fp64vec2 = vec_xl(0, y+(i+14)); - y8_fp64vec2 = vec_xl(0, y+(i+16)); - y9_fp64vec2 = vec_xl(0, y+(i+18)); - y10_fp64vec2 = vec_xl(0, y+(i+20)); - y11_fp64vec2 = vec_xl(0, y+(i+22)); - - x0_fp64vec2 = vec_xl(0, x+(i )); - x1_fp64vec2 = vec_xl(0, x+(i+2 )); - x2_fp64vec2 = vec_xl(0, x+(i+4 )); - x3_fp64vec2 = vec_xl(0, x+(i+6 )); - x4_fp64vec2 = vec_xl(0, x+(i+8 )); - x5_fp64vec2 = vec_xl(0, x+(i+10)); - x6_fp64vec2 = vec_xl(0, x+(i+12)); - x7_fp64vec2 = vec_xl(0, x+(i+14)); - x8_fp64vec2 = vec_xl(0, x+(i+16)); - x9_fp64vec2 = vec_xl(0, x+(i+18)); - x10_fp64vec2 = vec_xl(0, x+(i+20)); - x11_fp64vec2 = vec_xl(0, x+(i+22)); - - y0_fp64vec2 = vec_madd(y0_fp64vec2, c_fp64vec2, x0_fp64vec2); - y1_fp64vec2 = vec_madd(y1_fp64vec2, c_fp64vec2, x1_fp64vec2); - y2_fp64vec2 = vec_madd(y2_fp64vec2, c_fp64vec2, x2_fp64vec2); - y3_fp64vec2 = vec_madd(y3_fp64vec2, c_fp64vec2, x3_fp64vec2); - y4_fp64vec2 = vec_madd(y4_fp64vec2, c_fp64vec2, x4_fp64vec2); - y5_fp64vec2 = vec_madd(y5_fp64vec2, c_fp64vec2, x5_fp64vec2); - y6_fp64vec2 = vec_madd(y6_fp64vec2, c_fp64vec2, x6_fp64vec2); - y7_fp64vec2 = vec_madd(y7_fp64vec2, c_fp64vec2, x7_fp64vec2); - y8_fp64vec2 = vec_madd(y8_fp64vec2, c_fp64vec2, x8_fp64vec2); - y9_fp64vec2 = vec_madd(y9_fp64vec2, c_fp64vec2, x9_fp64vec2); 
- y10_fp64vec2 = vec_madd(y10_fp64vec2, c_fp64vec2,x10_fp64vec2); - y11_fp64vec2 = vec_madd(y11_fp64vec2, c_fp64vec2,x11_fp64vec2); - - vec_xst(y0_fp64vec2, 0, z+(i )); - vec_xst(y1_fp64vec2, 0, z+(i+2 )); - vec_xst(y2_fp64vec2, 0, z+(i+4 )); - vec_xst(y3_fp64vec2, 0, z+(i+6 )); - vec_xst(y4_fp64vec2, 0, z+(i+8 )); - vec_xst(y5_fp64vec2, 0, z+(i+10)); - vec_xst(y6_fp64vec2, 0, z+(i+12)); - vec_xst(y7_fp64vec2, 0, z+(i+14)); - vec_xst(y8_fp64vec2, 0, z+(i+16)); - vec_xst(y9_fp64vec2, 0, z+(i+18)); - vec_xst(y10_fp64vec2, 0, z+(i+20)); - vec_xst(y11_fp64vec2, 0, z+(i+22)); - } - for (; i <= n-8; i += 8) - { - y0_fp64vec2 = vec_xl(0, y+(i )); - y1_fp64vec2 = vec_xl(0, y+(i+2 )); - y2_fp64vec2 = vec_xl(0, y+(i+4 )); - y3_fp64vec2 = vec_xl(0, y+(i+6 )); - - x0_fp64vec2 = vec_xl(0, x+(i )); - x1_fp64vec2 = vec_xl(0, x+(i+2 )); - x2_fp64vec2 = vec_xl(0, x+(i+4 )); - x3_fp64vec2 = vec_xl(0, x+(i+6 )); - - y0_fp64vec2 = vec_madd(y0_fp64vec2, c_fp64vec2, x0_fp64vec2); - y1_fp64vec2 = vec_madd(y1_fp64vec2, c_fp64vec2, x1_fp64vec2); - y2_fp64vec2 = vec_madd(y2_fp64vec2, c_fp64vec2, x2_fp64vec2); - y3_fp64vec2 = vec_madd(y3_fp64vec2, c_fp64vec2, x3_fp64vec2); - - vec_xst(y0_fp64vec2, 0, z+(i )); - vec_xst(y1_fp64vec2, 0, z+(i+2 )); - vec_xst(y2_fp64vec2, 0, z+(i+4 )); - vec_xst(y3_fp64vec2, 0, z+(i+6 )); - } - for (; i <= n-2; i += 2) - { - y0_fp64vec2 = vec_xl(0, y+(i )); - x0_fp64vec2 = vec_xl(0, x+(i )); - y0_fp64vec2 = vec_madd(y0_fp64vec2, c_fp64vec2, x0_fp64vec2); - vec_xst(y0_fp64vec2, 0, z+(i )); - } - for (; i < n; i++) - z[i] = x[i] + c* y[i]; -} - - -//-------------------------------------------------------------------------------------------------- -// THDoubleVector_adds_VSX: -//-------------------------------------------------------------------------------------------------- -static void THDoubleVector_adds_VSX(double *y, const double *x, const double c, const ptrdiff_t n) -{ - ptrdiff_t i; - - double val[2] = {c, c}; - vector double c_fp64vec2 = vec_xl(0, val); - 
- vector double y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2; - vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2; - vector double x0_fp64vec2, x1_fp64vec2, x2_fp64vec2, x3_fp64vec2, x4_fp64vec2, x5_fp64vec2, x6_fp64vec2, x7_fp64vec2; - vector double x8_fp64vec2, x9_fp64vec2, x10_fp64vec2, x11_fp64vec2; - - - for (i = 0; i <= n-24; i += 24) - { - x0_fp64vec2 = vec_xl(0, x+(i )); - x1_fp64vec2 = vec_xl(0, x+(i+2 )); - x2_fp64vec2 = vec_xl(0, x+(i+4 )); - x3_fp64vec2 = vec_xl(0, x+(i+6 )); - x4_fp64vec2 = vec_xl(0, x+(i+8 )); - x5_fp64vec2 = vec_xl(0, x+(i+10)); - x6_fp64vec2 = vec_xl(0, x+(i+12)); - x7_fp64vec2 = vec_xl(0, x+(i+14)); - x8_fp64vec2 = vec_xl(0, x+(i+16)); - x9_fp64vec2 = vec_xl(0, x+(i+18)); - x10_fp64vec2 = vec_xl(0, x+(i+20)); - x11_fp64vec2 = vec_xl(0, x+(i+22)); - - y0_fp64vec2 = vec_add(x0_fp64vec2, c_fp64vec2); - y1_fp64vec2 = vec_add(x1_fp64vec2, c_fp64vec2); - y2_fp64vec2 = vec_add(x2_fp64vec2, c_fp64vec2); - y3_fp64vec2 = vec_add(x3_fp64vec2, c_fp64vec2); - y4_fp64vec2 = vec_add(x4_fp64vec2, c_fp64vec2); - y5_fp64vec2 = vec_add(x5_fp64vec2, c_fp64vec2); - y6_fp64vec2 = vec_add(x6_fp64vec2, c_fp64vec2); - y7_fp64vec2 = vec_add(x7_fp64vec2, c_fp64vec2); - y8_fp64vec2 = vec_add(x8_fp64vec2, c_fp64vec2); - y9_fp64vec2 = vec_add(x9_fp64vec2, c_fp64vec2); - y10_fp64vec2 = vec_add(x10_fp64vec2, c_fp64vec2); - y11_fp64vec2 = vec_add(x11_fp64vec2, c_fp64vec2); - - - vec_xst(y0_fp64vec2, 0, y+(i )); - vec_xst(y1_fp64vec2, 0, y+(i+2 )); - vec_xst(y2_fp64vec2, 0, y+(i+4 )); - vec_xst(y3_fp64vec2, 0, y+(i+6 )); - vec_xst(y4_fp64vec2, 0, y+(i+8 )); - vec_xst(y5_fp64vec2, 0, y+(i+10)); - vec_xst(y6_fp64vec2, 0, y+(i+12)); - vec_xst(y7_fp64vec2, 0, y+(i+14)); - vec_xst(y8_fp64vec2, 0, y+(i+16)); - vec_xst(y9_fp64vec2, 0, y+(i+18)); - vec_xst(y10_fp64vec2, 0, y+(i+20)); - vec_xst(y11_fp64vec2, 0, y+(i+22)); - } - for (; i <= n-8; i += 8) - { - x0_fp64vec2 = vec_xl(0, x+(i )); - 
x1_fp64vec2 = vec_xl(0, x+(i+2 )); - x2_fp64vec2 = vec_xl(0, x+(i+4 )); - x3_fp64vec2 = vec_xl(0, x+(i+6 )); - - y0_fp64vec2 = vec_add(x0_fp64vec2, c_fp64vec2); - y1_fp64vec2 = vec_add(x1_fp64vec2, c_fp64vec2); - y2_fp64vec2 = vec_add(x2_fp64vec2, c_fp64vec2); - y3_fp64vec2 = vec_add(x3_fp64vec2, c_fp64vec2); - - vec_xst(y0_fp64vec2, 0, y+(i )); - vec_xst(y1_fp64vec2, 0, y+(i+2 )); - vec_xst(y2_fp64vec2, 0, y+(i+4 )); - vec_xst(y3_fp64vec2, 0, y+(i+6 )); - } - for (; i <= n-2; i += 2) - { - x0_fp64vec2 = vec_xl(0, x+(i )); - y0_fp64vec2 = vec_add(x0_fp64vec2, c_fp64vec2); - vec_xst(y0_fp64vec2, 0, y+(i )); - } - for (; i < n; i++) - y[i] = x[i] +c; -} - - -//-------------------------------------------------------------------------------------------------- -// THDoubleVector_cmul_VSX: -//-------------------------------------------------------------------------------------------------- -static void THDoubleVector_cmul_VSX(double *z, const double *x, const double *y, const ptrdiff_t n) -{ - ptrdiff_t i; - - vector double y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2; - vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2; - vector double x0_fp64vec2, x1_fp64vec2, x2_fp64vec2, x3_fp64vec2, x4_fp64vec2, x5_fp64vec2, x6_fp64vec2, x7_fp64vec2; - vector double x8_fp64vec2, x9_fp64vec2, x10_fp64vec2, x11_fp64vec2; - - - for (i = 0; i <= n-24; i += 24) - { - y0_fp64vec2 = vec_xl(0, y+(i )); - y1_fp64vec2 = vec_xl(0, y+(i+2 )); - y2_fp64vec2 = vec_xl(0, y+(i+4 )); - y3_fp64vec2 = vec_xl(0, y+(i+6 )); - y4_fp64vec2 = vec_xl(0, y+(i+8 )); - y5_fp64vec2 = vec_xl(0, y+(i+10)); - y6_fp64vec2 = vec_xl(0, y+(i+12)); - y7_fp64vec2 = vec_xl(0, y+(i+14)); - y8_fp64vec2 = vec_xl(0, y+(i+16)); - y9_fp64vec2 = vec_xl(0, y+(i+18)); - y10_fp64vec2 = vec_xl(0, y+(i+20)); - y11_fp64vec2 = vec_xl(0, y+(i+22)); - - x0_fp64vec2 = vec_xl(0, x+(i )); - x1_fp64vec2 = vec_xl(0, x+(i+2 )); - x2_fp64vec2 = vec_xl(0, x+(i+4 )); - 
x3_fp64vec2 = vec_xl(0, x+(i+6 )); - x4_fp64vec2 = vec_xl(0, x+(i+8 )); - x5_fp64vec2 = vec_xl(0, x+(i+10)); - x6_fp64vec2 = vec_xl(0, x+(i+12)); - x7_fp64vec2 = vec_xl(0, x+(i+14)); - x8_fp64vec2 = vec_xl(0, x+(i+16)); - x9_fp64vec2 = vec_xl(0, x+(i+18)); - x10_fp64vec2 = vec_xl(0, x+(i+20)); - x11_fp64vec2 = vec_xl(0, x+(i+22)); - - y0_fp64vec2 = vec_mul(y0_fp64vec2, x0_fp64vec2); - y1_fp64vec2 = vec_mul(y1_fp64vec2, x1_fp64vec2); - y2_fp64vec2 = vec_mul(y2_fp64vec2, x2_fp64vec2); - y3_fp64vec2 = vec_mul(y3_fp64vec2, x3_fp64vec2); - y4_fp64vec2 = vec_mul(y4_fp64vec2, x4_fp64vec2); - y5_fp64vec2 = vec_mul(y5_fp64vec2, x5_fp64vec2); - y6_fp64vec2 = vec_mul(y6_fp64vec2, x6_fp64vec2); - y7_fp64vec2 = vec_mul(y7_fp64vec2, x7_fp64vec2); - y8_fp64vec2 = vec_mul(y8_fp64vec2, x8_fp64vec2); - y9_fp64vec2 = vec_mul(y9_fp64vec2, x9_fp64vec2); - y10_fp64vec2 = vec_mul(y10_fp64vec2, x10_fp64vec2); - y11_fp64vec2 = vec_mul(y11_fp64vec2, x11_fp64vec2); - - vec_xst(y0_fp64vec2, 0, z+(i )); - vec_xst(y1_fp64vec2, 0, z+(i+2 )); - vec_xst(y2_fp64vec2, 0, z+(i+4 )); - vec_xst(y3_fp64vec2, 0, z+(i+6 )); - vec_xst(y4_fp64vec2, 0, z+(i+8 )); - vec_xst(y5_fp64vec2, 0, z+(i+10)); - vec_xst(y6_fp64vec2, 0, z+(i+12)); - vec_xst(y7_fp64vec2, 0, z+(i+14)); - vec_xst(y8_fp64vec2, 0, z+(i+16)); - vec_xst(y9_fp64vec2, 0, z+(i+18)); - vec_xst(y10_fp64vec2, 0, z+(i+20)); - vec_xst(y11_fp64vec2, 0, z+(i+22)); - } - for (; i <= n-8; i += 8) - { - y0_fp64vec2 = vec_xl(0, y+(i )); - y1_fp64vec2 = vec_xl(0, y+(i+2 )); - y2_fp64vec2 = vec_xl(0, y+(i+4 )); - y3_fp64vec2 = vec_xl(0, y+(i+6 )); - - x0_fp64vec2 = vec_xl(0, x+(i )); - x1_fp64vec2 = vec_xl(0, x+(i+2 )); - x2_fp64vec2 = vec_xl(0, x+(i+4 )); - x3_fp64vec2 = vec_xl(0, x+(i+6 )); - - y0_fp64vec2 = vec_mul(y0_fp64vec2, x0_fp64vec2); - y1_fp64vec2 = vec_mul(y1_fp64vec2, x1_fp64vec2); - y2_fp64vec2 = vec_mul(y2_fp64vec2, x2_fp64vec2); - y3_fp64vec2 = vec_mul(y3_fp64vec2, x3_fp64vec2); - - vec_xst(y0_fp64vec2, 0, z+(i )); - vec_xst(y1_fp64vec2, 0, 
z+(i+2 )); - vec_xst(y2_fp64vec2, 0, z+(i+4 )); - vec_xst(y3_fp64vec2, 0, z+(i+6 )); - } - for (; i <= n-2; i += 2) - { - y0_fp64vec2 = vec_xl(0, y+(i )); - x0_fp64vec2 = vec_xl(0, x+(i )); - y0_fp64vec2 = vec_mul(y0_fp64vec2, x0_fp64vec2); - vec_xst(y0_fp64vec2, 0, z+(i )); - } - for (; i < n; i++) - z[i] = x[i] * y[i]; -} - - -//-------------------------------------------------------------------------------------------------- -// THDoubleVector_muls_VSX: -//-------------------------------------------------------------------------------------------------- -static void THDoubleVector_muls_VSX(double *y, const double *x, const double c, const ptrdiff_t n) -{ - ptrdiff_t i; - - double val[2] = {c, c}; - vector double c_fp64vec2 = vec_xl(0, val); - - vector double y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2; - vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2; - vector double x0_fp64vec2, x1_fp64vec2, x2_fp64vec2, x3_fp64vec2, x4_fp64vec2, x5_fp64vec2, x6_fp64vec2, x7_fp64vec2; - vector double x8_fp64vec2, x9_fp64vec2, x10_fp64vec2, x11_fp64vec2; - - - for (i = 0; i <= n-24; i += 24) - { - x0_fp64vec2 = vec_xl(0, x+(i )); - x1_fp64vec2 = vec_xl(0, x+(i+2 )); - x2_fp64vec2 = vec_xl(0, x+(i+4 )); - x3_fp64vec2 = vec_xl(0, x+(i+6 )); - x4_fp64vec2 = vec_xl(0, x+(i+8 )); - x5_fp64vec2 = vec_xl(0, x+(i+10)); - x6_fp64vec2 = vec_xl(0, x+(i+12)); - x7_fp64vec2 = vec_xl(0, x+(i+14)); - x8_fp64vec2 = vec_xl(0, x+(i+16)); - x9_fp64vec2 = vec_xl(0, x+(i+18)); - x10_fp64vec2 = vec_xl(0, x+(i+20)); - x11_fp64vec2 = vec_xl(0, x+(i+22)); - - y0_fp64vec2 = vec_mul(x0_fp64vec2, c_fp64vec2); - y1_fp64vec2 = vec_mul(x1_fp64vec2, c_fp64vec2); - y2_fp64vec2 = vec_mul(x2_fp64vec2, c_fp64vec2); - y3_fp64vec2 = vec_mul(x3_fp64vec2, c_fp64vec2); - y4_fp64vec2 = vec_mul(x4_fp64vec2, c_fp64vec2); - y5_fp64vec2 = vec_mul(x5_fp64vec2, c_fp64vec2); - y6_fp64vec2 = vec_mul(x6_fp64vec2, c_fp64vec2); - y7_fp64vec2 = 
vec_mul(x7_fp64vec2, c_fp64vec2); - y8_fp64vec2 = vec_mul(x8_fp64vec2, c_fp64vec2); - y9_fp64vec2 = vec_mul(x9_fp64vec2, c_fp64vec2); - y10_fp64vec2 = vec_mul(x10_fp64vec2, c_fp64vec2); - y11_fp64vec2 = vec_mul(x11_fp64vec2, c_fp64vec2); - - - vec_xst(y0_fp64vec2, 0, y+(i )); - vec_xst(y1_fp64vec2, 0, y+(i+2 )); - vec_xst(y2_fp64vec2, 0, y+(i+4 )); - vec_xst(y3_fp64vec2, 0, y+(i+6 )); - vec_xst(y4_fp64vec2, 0, y+(i+8 )); - vec_xst(y5_fp64vec2, 0, y+(i+10)); - vec_xst(y6_fp64vec2, 0, y+(i+12)); - vec_xst(y7_fp64vec2, 0, y+(i+14)); - vec_xst(y8_fp64vec2, 0, y+(i+16)); - vec_xst(y9_fp64vec2, 0, y+(i+18)); - vec_xst(y10_fp64vec2, 0, y+(i+20)); - vec_xst(y11_fp64vec2, 0, y+(i+22)); - } - for (; i <= n-8; i += 8) - { - x0_fp64vec2 = vec_xl(0, x+(i )); - x1_fp64vec2 = vec_xl(0, x+(i+2 )); - x2_fp64vec2 = vec_xl(0, x+(i+4 )); - x3_fp64vec2 = vec_xl(0, x+(i+6 )); - - y0_fp64vec2 = vec_mul(x0_fp64vec2, c_fp64vec2); - y1_fp64vec2 = vec_mul(x1_fp64vec2, c_fp64vec2); - y2_fp64vec2 = vec_mul(x2_fp64vec2, c_fp64vec2); - y3_fp64vec2 = vec_mul(x3_fp64vec2, c_fp64vec2); - - vec_xst(y0_fp64vec2, 0, y+(i )); - vec_xst(y1_fp64vec2, 0, y+(i+2 )); - vec_xst(y2_fp64vec2, 0, y+(i+4 )); - vec_xst(y3_fp64vec2, 0, y+(i+6 )); - } - for (; i <= n-2; i += 2) - { - x0_fp64vec2 = vec_xl(0, x+(i )); - y0_fp64vec2 = vec_mul(x0_fp64vec2, c_fp64vec2); - vec_xst(y0_fp64vec2, 0, y+(i )); - } - for (; i < n; i++) - y[i] = c * x[i]; -} - - -//-------------------------------------------------------------------------------------------------- -// THDoubleVector_cdiv_VSX: -//-------------------------------------------------------------------------------------------------- -static void THDoubleVector_cdiv_VSX(double *z, const double *x, const double *y, const ptrdiff_t n) -{ - ptrdiff_t i; - - vector double y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2; - vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2; - vector double x0_fp64vec2, 
x1_fp64vec2, x2_fp64vec2, x3_fp64vec2, x4_fp64vec2, x5_fp64vec2, x6_fp64vec2, x7_fp64vec2; - vector double x8_fp64vec2, x9_fp64vec2, x10_fp64vec2, x11_fp64vec2; - - - for (i = 0; i <= n-24; i += 24) - { - y0_fp64vec2 = vec_xl(0, y+(i )); - y1_fp64vec2 = vec_xl(0, y+(i+2 )); - y2_fp64vec2 = vec_xl(0, y+(i+4 )); - y3_fp64vec2 = vec_xl(0, y+(i+6 )); - y4_fp64vec2 = vec_xl(0, y+(i+8 )); - y5_fp64vec2 = vec_xl(0, y+(i+10)); - y6_fp64vec2 = vec_xl(0, y+(i+12)); - y7_fp64vec2 = vec_xl(0, y+(i+14)); - y8_fp64vec2 = vec_xl(0, y+(i+16)); - y9_fp64vec2 = vec_xl(0, y+(i+18)); - y10_fp64vec2 = vec_xl(0, y+(i+20)); - y11_fp64vec2 = vec_xl(0, y+(i+22)); - - x0_fp64vec2 = vec_xl(0, x+(i )); - x1_fp64vec2 = vec_xl(0, x+(i+2 )); - x2_fp64vec2 = vec_xl(0, x+(i+4 )); - x3_fp64vec2 = vec_xl(0, x+(i+6 )); - x4_fp64vec2 = vec_xl(0, x+(i+8 )); - x5_fp64vec2 = vec_xl(0, x+(i+10)); - x6_fp64vec2 = vec_xl(0, x+(i+12)); - x7_fp64vec2 = vec_xl(0, x+(i+14)); - x8_fp64vec2 = vec_xl(0, x+(i+16)); - x9_fp64vec2 = vec_xl(0, x+(i+18)); - x10_fp64vec2 = vec_xl(0, x+(i+20)); - x11_fp64vec2 = vec_xl(0, x+(i+22)); - - y0_fp64vec2 = vec_div(x0_fp64vec2, y0_fp64vec2); - y1_fp64vec2 = vec_div(x1_fp64vec2, y1_fp64vec2); - y2_fp64vec2 = vec_div(x2_fp64vec2, y2_fp64vec2); - y3_fp64vec2 = vec_div(x3_fp64vec2, y3_fp64vec2); - y4_fp64vec2 = vec_div(x4_fp64vec2, y4_fp64vec2); - y5_fp64vec2 = vec_div(x5_fp64vec2, y5_fp64vec2); - y6_fp64vec2 = vec_div(x6_fp64vec2, y6_fp64vec2); - y7_fp64vec2 = vec_div(x7_fp64vec2, y7_fp64vec2); - y8_fp64vec2 = vec_div(x8_fp64vec2, y8_fp64vec2); - y9_fp64vec2 = vec_div(x9_fp64vec2, y9_fp64vec2); - y10_fp64vec2 = vec_div(x10_fp64vec2, y10_fp64vec2); - y11_fp64vec2 = vec_div(x11_fp64vec2, y11_fp64vec2); - - vec_xst(y0_fp64vec2, 0, z+(i )); - vec_xst(y1_fp64vec2, 0, z+(i+2 )); - vec_xst(y2_fp64vec2, 0, z+(i+4 )); - vec_xst(y3_fp64vec2, 0, z+(i+6 )); - vec_xst(y4_fp64vec2, 0, z+(i+8 )); - vec_xst(y5_fp64vec2, 0, z+(i+10)); - vec_xst(y6_fp64vec2, 0, z+(i+12)); - vec_xst(y7_fp64vec2, 0, 
z+(i+14)); - vec_xst(y8_fp64vec2, 0, z+(i+16)); - vec_xst(y9_fp64vec2, 0, z+(i+18)); - vec_xst(y10_fp64vec2, 0, z+(i+20)); - vec_xst(y11_fp64vec2, 0, z+(i+22)); - } - for (; i <= n-8; i += 8) - { - y0_fp64vec2 = vec_xl(0, y+(i )); - y1_fp64vec2 = vec_xl(0, y+(i+2 )); - y2_fp64vec2 = vec_xl(0, y+(i+4 )); - y3_fp64vec2 = vec_xl(0, y+(i+6 )); - - x0_fp64vec2 = vec_xl(0, x+(i )); - x1_fp64vec2 = vec_xl(0, x+(i+2 )); - x2_fp64vec2 = vec_xl(0, x+(i+4 )); - x3_fp64vec2 = vec_xl(0, x+(i+6 )); - - y0_fp64vec2 = vec_div(x0_fp64vec2, y0_fp64vec2); - y1_fp64vec2 = vec_div(x1_fp64vec2, y1_fp64vec2); - y2_fp64vec2 = vec_div(x2_fp64vec2, y2_fp64vec2); - y3_fp64vec2 = vec_div(x3_fp64vec2, y3_fp64vec2); - - vec_xst(y0_fp64vec2, 0, z+(i )); - vec_xst(y1_fp64vec2, 0, z+(i+2 )); - vec_xst(y2_fp64vec2, 0, z+(i+4 )); - vec_xst(y3_fp64vec2, 0, z+(i+6 )); - } - for (; i <= n-2; i += 2) - { - y0_fp64vec2 = vec_xl(0, y+(i )); - x0_fp64vec2 = vec_xl(0, x+(i )); - y0_fp64vec2 = vec_div(x0_fp64vec2, y0_fp64vec2); - vec_xst(y0_fp64vec2, 0, z+(i )); - } - for (; i < n; i++) - z[i] = x[i] / y[i]; -} - - -//-------------------------------------------------------------------------------------------------- -// THDoubleVector_divs_VSX: -//-------------------------------------------------------------------------------------------------- -static void THDoubleVector_divs_VSX(double *y, const double *x, const double c, const ptrdiff_t n) -{ - ptrdiff_t i; - - double val[2] = {c, c}; - vector double c_fp64vec2 = vec_xl(0, val); - - vector double y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2; - vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2; - vector double x0_fp64vec2, x1_fp64vec2, x2_fp64vec2, x3_fp64vec2, x4_fp64vec2, x5_fp64vec2, x6_fp64vec2, x7_fp64vec2; - vector double x8_fp64vec2, x9_fp64vec2, x10_fp64vec2, x11_fp64vec2; - - - for (i = 0; i <= n-24; i += 24) - { - x0_fp64vec2 = vec_xl(0, x+(i )); - x1_fp64vec2 = vec_xl(0, 
x+(i+2 )); - x2_fp64vec2 = vec_xl(0, x+(i+4 )); - x3_fp64vec2 = vec_xl(0, x+(i+6 )); - x4_fp64vec2 = vec_xl(0, x+(i+8 )); - x5_fp64vec2 = vec_xl(0, x+(i+10)); - x6_fp64vec2 = vec_xl(0, x+(i+12)); - x7_fp64vec2 = vec_xl(0, x+(i+14)); - x8_fp64vec2 = vec_xl(0, x+(i+16)); - x9_fp64vec2 = vec_xl(0, x+(i+18)); - x10_fp64vec2 = vec_xl(0, x+(i+20)); - x11_fp64vec2 = vec_xl(0, x+(i+22)); - - y0_fp64vec2 = vec_div(x0_fp64vec2, c_fp64vec2); - y1_fp64vec2 = vec_div(x1_fp64vec2, c_fp64vec2); - y2_fp64vec2 = vec_div(x2_fp64vec2, c_fp64vec2); - y3_fp64vec2 = vec_div(x3_fp64vec2, c_fp64vec2); - y4_fp64vec2 = vec_div(x4_fp64vec2, c_fp64vec2); - y5_fp64vec2 = vec_div(x5_fp64vec2, c_fp64vec2); - y6_fp64vec2 = vec_div(x6_fp64vec2, c_fp64vec2); - y7_fp64vec2 = vec_div(x7_fp64vec2, c_fp64vec2); - y8_fp64vec2 = vec_div(x8_fp64vec2, c_fp64vec2); - y9_fp64vec2 = vec_div(x9_fp64vec2, c_fp64vec2); - y10_fp64vec2 = vec_div(x10_fp64vec2, c_fp64vec2); - y11_fp64vec2 = vec_div(x11_fp64vec2, c_fp64vec2); - - - vec_xst(y0_fp64vec2, 0, y+(i )); - vec_xst(y1_fp64vec2, 0, y+(i+2 )); - vec_xst(y2_fp64vec2, 0, y+(i+4 )); - vec_xst(y3_fp64vec2, 0, y+(i+6 )); - vec_xst(y4_fp64vec2, 0, y+(i+8 )); - vec_xst(y5_fp64vec2, 0, y+(i+10)); - vec_xst(y6_fp64vec2, 0, y+(i+12)); - vec_xst(y7_fp64vec2, 0, y+(i+14)); - vec_xst(y8_fp64vec2, 0, y+(i+16)); - vec_xst(y9_fp64vec2, 0, y+(i+18)); - vec_xst(y10_fp64vec2, 0, y+(i+20)); - vec_xst(y11_fp64vec2, 0, y+(i+22)); - } - for (; i <= n-8; i += 8) - { - x0_fp64vec2 = vec_xl(0, x+(i )); - x1_fp64vec2 = vec_xl(0, x+(i+2 )); - x2_fp64vec2 = vec_xl(0, x+(i+4 )); - x3_fp64vec2 = vec_xl(0, x+(i+6 )); - - y0_fp64vec2 = vec_div(x0_fp64vec2, c_fp64vec2); - y1_fp64vec2 = vec_div(x1_fp64vec2, c_fp64vec2); - y2_fp64vec2 = vec_div(x2_fp64vec2, c_fp64vec2); - y3_fp64vec2 = vec_div(x3_fp64vec2, c_fp64vec2); - - vec_xst(y0_fp64vec2, 0, y+(i )); - vec_xst(y1_fp64vec2, 0, y+(i+2 )); - vec_xst(y2_fp64vec2, 0, y+(i+4 )); - vec_xst(y3_fp64vec2, 0, y+(i+6 )); - - vec_xst(y0_fp64vec2, 0, 
y+(i )); - vec_xst(y1_fp64vec2, 0, y+(i+2 )); - vec_xst(y2_fp64vec2, 0, y+(i+4 )); - vec_xst(y3_fp64vec2, 0, y+(i+6 )); - } - for (; i <= n-2; i += 2) - { - x0_fp64vec2 = vec_xl(0, x+(i )); - y0_fp64vec2 = vec_div(x0_fp64vec2, c_fp64vec2); - vec_xst(y0_fp64vec2, 0, y+(i )); - } - for (; i < n; i++) - y[i] = x[i] / c; -} - - -//-------------------------------------------------------------------------------------------------- -// THFloatVector_fill_VSX: -//-------------------------------------------------------------------------------------------------- -static void THFloatVector_fill_VSX(float *x, const float c, const ptrdiff_t n) -{ - ptrdiff_t i; - - float val[4] = {c, c, c, c}; - vector float fp32vec4 = vec_xl(0, val); - - for (i = 0; i <= n-256; i += 256) - { - vec_xst(fp32vec4, 0, x+(i )); - vec_xst(fp32vec4, 0, x+(i+4 )); - vec_xst(fp32vec4, 0, x+(i+8 )); - vec_xst(fp32vec4, 0, x+(i+12 )); - vec_xst(fp32vec4, 0, x+(i+16 )); - vec_xst(fp32vec4, 0, x+(i+20 )); - vec_xst(fp32vec4, 0, x+(i+24 )); - vec_xst(fp32vec4, 0, x+(i+28 )); - vec_xst(fp32vec4, 0, x+(i+32 )); - vec_xst(fp32vec4, 0, x+(i+36 )); - vec_xst(fp32vec4, 0, x+(i+40 )); - vec_xst(fp32vec4, 0, x+(i+44 )); - vec_xst(fp32vec4, 0, x+(i+48 )); - vec_xst(fp32vec4, 0, x+(i+52 )); - vec_xst(fp32vec4, 0, x+(i+56 )); - vec_xst(fp32vec4, 0, x+(i+60 )); - vec_xst(fp32vec4, 0, x+(i+64 )); - vec_xst(fp32vec4, 0, x+(i+68 )); - vec_xst(fp32vec4, 0, x+(i+72 )); - vec_xst(fp32vec4, 0, x+(i+76 )); - vec_xst(fp32vec4, 0, x+(i+80 )); - vec_xst(fp32vec4, 0, x+(i+84 )); - vec_xst(fp32vec4, 0, x+(i+88 )); - vec_xst(fp32vec4, 0, x+(i+92 )); - vec_xst(fp32vec4, 0, x+(i+96 )); - vec_xst(fp32vec4, 0, x+(i+100)); - vec_xst(fp32vec4, 0, x+(i+104)); - vec_xst(fp32vec4, 0, x+(i+108)); - vec_xst(fp32vec4, 0, x+(i+112)); - vec_xst(fp32vec4, 0, x+(i+116)); - vec_xst(fp32vec4, 0, x+(i+120)); - vec_xst(fp32vec4, 0, x+(i+124)); - vec_xst(fp32vec4, 0, x+(i+128)); - vec_xst(fp32vec4, 0, x+(i+132)); - vec_xst(fp32vec4, 0, x+(i+136)); - 
vec_xst(fp32vec4, 0, x+(i+140)); - vec_xst(fp32vec4, 0, x+(i+144)); - vec_xst(fp32vec4, 0, x+(i+148)); - vec_xst(fp32vec4, 0, x+(i+152)); - vec_xst(fp32vec4, 0, x+(i+156)); - vec_xst(fp32vec4, 0, x+(i+160)); - vec_xst(fp32vec4, 0, x+(i+164)); - vec_xst(fp32vec4, 0, x+(i+168)); - vec_xst(fp32vec4, 0, x+(i+172)); - vec_xst(fp32vec4, 0, x+(i+176)); - vec_xst(fp32vec4, 0, x+(i+180)); - vec_xst(fp32vec4, 0, x+(i+184)); - vec_xst(fp32vec4, 0, x+(i+188)); - vec_xst(fp32vec4, 0, x+(i+192)); - vec_xst(fp32vec4, 0, x+(i+196)); - vec_xst(fp32vec4, 0, x+(i+200)); - vec_xst(fp32vec4, 0, x+(i+204)); - vec_xst(fp32vec4, 0, x+(i+208)); - vec_xst(fp32vec4, 0, x+(i+212)); - vec_xst(fp32vec4, 0, x+(i+216)); - vec_xst(fp32vec4, 0, x+(i+220)); - vec_xst(fp32vec4, 0, x+(i+224)); - vec_xst(fp32vec4, 0, x+(i+228)); - vec_xst(fp32vec4, 0, x+(i+232)); - vec_xst(fp32vec4, 0, x+(i+236)); - vec_xst(fp32vec4, 0, x+(i+240)); - vec_xst(fp32vec4, 0, x+(i+244)); - vec_xst(fp32vec4, 0, x+(i+248)); - vec_xst(fp32vec4, 0, x+(i+252)); - } - for (; i <= n-32; i += 32) - { - vec_xst(fp32vec4, 0, x+(i )); - vec_xst(fp32vec4, 0, x+(i+4 )); - vec_xst(fp32vec4, 0, x+(i+8 )); - vec_xst(fp32vec4, 0, x+(i+12 )); - vec_xst(fp32vec4, 0, x+(i+16 )); - vec_xst(fp32vec4, 0, x+(i+20 )); - vec_xst(fp32vec4, 0, x+(i+24 )); - vec_xst(fp32vec4, 0, x+(i+28 )); - } - for (; i <= n-4; i += 4) - vec_xst(fp32vec4, 0, x+(i )); - for (; i < n; i++) - x[i] = c; -} - - -//-------------------------------------------------------------------------------------------------- -// THFloatVector_cadd_VSX: -//-------------------------------------------------------------------------------------------------- -static void THFloatVector_cadd_VSX(float *z, const float *x, const float *y, const float c, const ptrdiff_t n) -{ - ptrdiff_t i; - - float val[4] = {c, c, c, c}; - vector float c_fp32vec4 = vec_xl(0, val); - - vector float y0_fp32vec4, y1_fp32vec4, y2_fp32vec4, y3_fp32vec4, y4_fp32vec4, y5_fp32vec4, y6_fp32vec4, y7_fp32vec4; - vector 
//--------------------------------------------------------------------------------------------------
// THFloatVector_cadd_VSX:
//--------------------------------------------------------------------------------------------------
// z[i] = x[i] + c * y[i] for i in [0, n), using VSX fused multiply-adds
// for the vectorizable prefix and a scalar loop for the tail.
static void THFloatVector_cadd_VSX(float *z, const float *x, const float *y, const float c, const ptrdiff_t n)
{
    ptrdiff_t i = 0;

    // Broadcast c into all four lanes.
    float splat[4] = {c, c, c, c};
    vector float cv = vec_xl(0, splat);
    vector float xv0, xv1, xv2, xv3;
    vector float yv0, yv1, yv2, yv3;

    // Bulk: four vectors (16 floats) per iteration.
    for (; i <= n - 16; i += 16)
    {
        yv0 = vec_xl(0, y + i);
        yv1 = vec_xl(0, y + i + 4);
        yv2 = vec_xl(0, y + i + 8);
        yv3 = vec_xl(0, y + i + 12);

        xv0 = vec_xl(0, x + i);
        xv1 = vec_xl(0, x + i + 4);
        xv2 = vec_xl(0, x + i + 8);
        xv3 = vec_xl(0, x + i + 12);

        // vec_madd(y, c, x) computes y*c + x in one instruction.
        vec_xst(vec_madd(yv0, cv, xv0), 0, z + i);
        vec_xst(vec_madd(yv1, cv, xv1), 0, z + i + 4);
        vec_xst(vec_madd(yv2, cv, xv2), 0, z + i + 8);
        vec_xst(vec_madd(yv3, cv, xv3), 0, z + i + 12);
    }
    // Cleanup: one vector at a time.
    for (; i <= n - 4; i += 4)
    {
        yv0 = vec_xl(0, y + i);
        xv0 = vec_xl(0, x + i);
        vec_xst(vec_madd(yv0, cv, xv0), 0, z + i);
    }
    // Scalar tail.
    for (; i < n; i++)
        z[i] = x[i] + c * y[i];
}
//--------------------------------------------------------------------------------------------------
// THFloatVector_adds_VSX:
//--------------------------------------------------------------------------------------------------
// y[i] = c + x[i] for i in [0, n): vectorized scalar-plus-array add.
static void THFloatVector_adds_VSX(float *y, const float *x, const float c, const ptrdiff_t n)
{
    ptrdiff_t i = 0;

    // Broadcast c into all four lanes.
    float splat[4] = {c, c, c, c};
    vector float cv = vec_xl(0, splat);
    vector float xv0, xv1, xv2, xv3;

    // Bulk: four vectors (16 floats) per iteration.
    for (; i <= n - 16; i += 16)
    {
        xv0 = vec_xl(0, x + i);
        xv1 = vec_xl(0, x + i + 4);
        xv2 = vec_xl(0, x + i + 8);
        xv3 = vec_xl(0, x + i + 12);

        vec_xst(vec_add(xv0, cv), 0, y + i);
        vec_xst(vec_add(xv1, cv), 0, y + i + 4);
        vec_xst(vec_add(xv2, cv), 0, y + i + 8);
        vec_xst(vec_add(xv3, cv), 0, y + i + 12);
    }
    // Cleanup: one vector at a time.
    for (; i <= n - 4; i += 4)
    {
        xv0 = vec_xl(0, x + i);
        vec_xst(vec_add(xv0, cv), 0, y + i);
    }
    // Scalar tail.
    for (; i < n; i++)
        y[i] = c + x[i];
}
//--------------------------------------------------------------------------------------------------
// THFloatVector_cmul_VSX:
//--------------------------------------------------------------------------------------------------
// z[i] = y[i] * x[i] for i in [0, n): vectorized element-wise multiply.
// NOTE: parameter order is (z, y, x), matching the original signature.
static void THFloatVector_cmul_VSX(float *z, const float *y, const float *x, const ptrdiff_t n)
{
    ptrdiff_t i = 0;

    vector float yv0, yv1, yv2, yv3;
    vector float xv0, xv1, xv2, xv3;

    // Bulk: four vectors (16 floats) per iteration.
    for (; i <= n - 16; i += 16)
    {
        yv0 = vec_xl(0, y + i);
        yv1 = vec_xl(0, y + i + 4);
        yv2 = vec_xl(0, y + i + 8);
        yv3 = vec_xl(0, y + i + 12);

        xv0 = vec_xl(0, x + i);
        xv1 = vec_xl(0, x + i + 4);
        xv2 = vec_xl(0, x + i + 8);
        xv3 = vec_xl(0, x + i + 12);

        vec_xst(vec_mul(yv0, xv0), 0, z + i);
        vec_xst(vec_mul(yv1, xv1), 0, z + i + 4);
        vec_xst(vec_mul(yv2, xv2), 0, z + i + 8);
        vec_xst(vec_mul(yv3, xv3), 0, z + i + 12);
    }
    // Cleanup: one vector at a time.
    for (; i <= n - 4; i += 4)
    {
        yv0 = vec_xl(0, y + i);
        xv0 = vec_xl(0, x + i);
        vec_xst(vec_mul(yv0, xv0), 0, z + i);
    }
    // Scalar tail.
    for (; i < n; i++)
        z[i] = y[i] * x[i];
}
c_fp32vec4); - y11_fp32vec4 = vec_mul(x11_fp32vec4, c_fp32vec4); - - vec_xst(y0_fp32vec4, 0, y+(i )); - vec_xst(y1_fp32vec4, 0, y+(i+4 )); - vec_xst(y2_fp32vec4, 0, y+(i+8 )); - vec_xst(y3_fp32vec4, 0, y+(i+12)); - vec_xst(y4_fp32vec4, 0, y+(i+16)); - vec_xst(y5_fp32vec4, 0, y+(i+20)); - vec_xst(y6_fp32vec4, 0, y+(i+24)); - vec_xst(y7_fp32vec4, 0, y+(i+28)); - vec_xst(y8_fp32vec4, 0, y+(i+32)); - vec_xst(y9_fp32vec4, 0, y+(i+36)); - vec_xst(y10_fp32vec4, 0, y+(i+40)); - vec_xst(y11_fp32vec4, 0, y+(i+44)); - } - for (; i <= n-16; i += 16) - { - x0_fp32vec4 = vec_xl(0, x+(i )); - x1_fp32vec4 = vec_xl(0, x+(i+4 )); - x2_fp32vec4 = vec_xl(0, x+(i+8 )); - x3_fp32vec4 = vec_xl(0, x+(i+12)); - - y0_fp32vec4 = vec_mul(x0_fp32vec4, c_fp32vec4); - y1_fp32vec4 = vec_mul(x1_fp32vec4, c_fp32vec4); - y2_fp32vec4 = vec_mul(x2_fp32vec4, c_fp32vec4); - y3_fp32vec4 = vec_mul(x3_fp32vec4, c_fp32vec4); - - vec_xst(y0_fp32vec4, 0, y+(i )); - vec_xst(y1_fp32vec4, 0, y+(i+4 )); - vec_xst(y2_fp32vec4, 0, y+(i+8 )); - vec_xst(y3_fp32vec4, 0, y+(i+12)); - } - for (; i <= n-4; i += 4) - { - x0_fp32vec4 = vec_xl(0, x+(i )); - y0_fp32vec4 = vec_mul(x0_fp32vec4, c_fp32vec4); - vec_xst(y0_fp32vec4, 0, y+(i )); - } - for (; i < n; i++) - y[i] = c * x[i]; -} - - -//-------------------------------------------------------------------------------------------------- -// THFloatVector_cdiv_VSX: -//-------------------------------------------------------------------------------------------------- -static void THFloatVector_cdiv_VSX(float *z, const float *x, const float *y, const ptrdiff_t n) -{ - ptrdiff_t i; - - vector float y0_fp32vec4, y1_fp32vec4, y2_fp32vec4, y3_fp32vec4, y4_fp32vec4, y5_fp32vec4, y6_fp32vec4, y7_fp32vec4; - vector float y8_fp32vec4, y9_fp32vec4, y10_fp32vec4, y11_fp32vec4; - vector float x0_fp32vec4, x1_fp32vec4, x2_fp32vec4, x3_fp32vec4, x4_fp32vec4, x5_fp32vec4, x6_fp32vec4, x7_fp32vec4; - vector float x8_fp32vec4, x9_fp32vec4, x10_fp32vec4, x11_fp32vec4; - - - for (i = 0; i <= 
//--------------------------------------------------------------------------------------------------
// THFloatVector_cdiv_VSX:
//--------------------------------------------------------------------------------------------------
// z[i] = x[i] / y[i] for i in [0, n): vectorized element-wise divide.
static void THFloatVector_cdiv_VSX(float *z, const float *x, const float *y, const ptrdiff_t n)
{
    ptrdiff_t i = 0;

    vector float xv0, xv1, xv2, xv3;
    vector float yv0, yv1, yv2, yv3;

    // Bulk: four vectors (16 floats) per iteration.
    for (; i <= n - 16; i += 16)
    {
        xv0 = vec_xl(0, x + i);
        xv1 = vec_xl(0, x + i + 4);
        xv2 = vec_xl(0, x + i + 8);
        xv3 = vec_xl(0, x + i + 12);

        yv0 = vec_xl(0, y + i);
        yv1 = vec_xl(0, y + i + 4);
        yv2 = vec_xl(0, y + i + 8);
        yv3 = vec_xl(0, y + i + 12);

        // Numerator is x, denominator is y.
        vec_xst(vec_div(xv0, yv0), 0, z + i);
        vec_xst(vec_div(xv1, yv1), 0, z + i + 4);
        vec_xst(vec_div(xv2, yv2), 0, z + i + 8);
        vec_xst(vec_div(xv3, yv3), 0, z + i + 12);
    }
    // Cleanup: one vector at a time.
    for (; i <= n - 4; i += 4)
    {
        xv0 = vec_xl(0, x + i);
        yv0 = vec_xl(0, y + i);
        vec_xst(vec_div(xv0, yv0), 0, z + i);
    }
    // Scalar tail.
    for (; i < n; i++)
        z[i] = x[i] / y[i];
}
vec_xl(0, x+(i+24)); - x7_fp64vec2 = vec_xl(0, x+(i+28)); - x8_fp64vec2 = vec_xl(0, x+(i+32)); - x9_fp64vec2 = vec_xl(0, x+(i+36)); - x10_fp64vec2 = vec_xl(0, x+(i+40)); - x11_fp64vec2 = vec_xl(0, x+(i+44)); - - y0_fp64vec2 = vec_div(x0_fp64vec2, c_fp64vec2); - y1_fp64vec2 = vec_div(x1_fp64vec2, c_fp64vec2); - y2_fp64vec2 = vec_div(x2_fp64vec2, c_fp64vec2); - y3_fp64vec2 = vec_div(x3_fp64vec2, c_fp64vec2); - y4_fp64vec2 = vec_div(x4_fp64vec2, c_fp64vec2); - y5_fp64vec2 = vec_div(x5_fp64vec2, c_fp64vec2); - y6_fp64vec2 = vec_div(x6_fp64vec2, c_fp64vec2); - y7_fp64vec2 = vec_div(x7_fp64vec2, c_fp64vec2); - y8_fp64vec2 = vec_div(x8_fp64vec2, c_fp64vec2); - y9_fp64vec2 = vec_div(x9_fp64vec2, c_fp64vec2); - y10_fp64vec2 = vec_div(x10_fp64vec2, c_fp64vec2); - y11_fp64vec2 = vec_div(x11_fp64vec2, c_fp64vec2); - - - vec_xst(y0_fp64vec2, 0, y+(i )); - vec_xst(y1_fp64vec2, 0, y+(i+4 )); - vec_xst(y2_fp64vec2, 0, y+(i+8 )); - vec_xst(y3_fp64vec2, 0, y+(i+12 )); - vec_xst(y4_fp64vec2, 0, y+(i+16 )); - vec_xst(y5_fp64vec2, 0, y+(i+20)); - vec_xst(y6_fp64vec2, 0, y+(i+24)); - vec_xst(y7_fp64vec2, 0, y+(i+28)); - vec_xst(y8_fp64vec2, 0, y+(i+32)); - vec_xst(y9_fp64vec2, 0, y+(i+36)); - vec_xst(y10_fp64vec2, 0, y+(i+40)); - vec_xst(y11_fp64vec2, 0, y+(i+44)); - } - for (; i <= n-16; i += 16) - { - x0_fp64vec2 = vec_xl(0, x+(i )); - x1_fp64vec2 = vec_xl(0, x+(i+4 )); - x2_fp64vec2 = vec_xl(0, x+(i+8 )); - x3_fp64vec2 = vec_xl(0, x+(i+12 )); - - y0_fp64vec2 = vec_div(x0_fp64vec2, c_fp64vec2); - y1_fp64vec2 = vec_div(x1_fp64vec2, c_fp64vec2); - y2_fp64vec2 = vec_div(x2_fp64vec2, c_fp64vec2); - y3_fp64vec2 = vec_div(x3_fp64vec2, c_fp64vec2); - - vec_xst(y0_fp64vec2, 0, y+(i )); - vec_xst(y1_fp64vec2, 0, y+(i+4 )); - vec_xst(y2_fp64vec2, 0, y+(i+8 )); - vec_xst(y3_fp64vec2, 0, y+(i+12 )); - - vec_xst(y0_fp64vec2, 0, y+(i )); - vec_xst(y1_fp64vec2, 0, y+(i+4 )); - vec_xst(y2_fp64vec2, 0, y+(i+8 )); - vec_xst(y3_fp64vec2, 0, y+(i+16 )); - } - for (; i <= n-4; i += 4) - { - x0_fp64vec2 = 
vec_xl(0, x+(i )); - y0_fp64vec2 = vec_div(x0_fp64vec2, c_fp64vec2); - vec_xst(y0_fp64vec2, 0, y+(i )); - } - for (; i < n; i++) - y[i] = x[i] / c; -} - - -//------------------------------------------------ -// -// Testing for correctness and performance -// -// If you want to run these tests, compile this -// file with -DRUN_VSX_TESTS on a Power machine, -// and then run the executable that is generated. -// -//------------------------------------------------ -// -// Example passing run (from a Power8 machine): -// -// $ gcc VSX.c -O2 -D RUN_VSX_TESTS -o vsxtest -// $ ./vsxtest -// -// TODO -// -// -// Finished runnning all tests. All tests PASSED. -// -//------------------------------------------------ -#ifdef RUN_VSX_TESTS - -#include <stdio.h> -#include <stdlib.h> -#include <time.h> -#include <assert.h> -#include <math.h> - -#define VSX_PERF_NUM_TEST_ELEMENTS 100000000 -#define VSX_FUNC_NUM_TEST_ELEMENTS 2507 - - -//-------------------------------------------------------------------------------------------------- -// Standard implementations: -//-------------------------------------------------------------------------------------------------- -static void standardDouble_fill(double *x, const double c, const ptrdiff_t n) -{ - for (ptrdiff_t i = 0; i < n; i++) - x[i] = c; -} - -static void standardFloat_fill(float *x, const float c, const ptrdiff_t n) -{ - for (ptrdiff_t i = 0; i < n; i++) - x[i] = c; -} - -static void standardDouble_cadd(double *z, const double *x, const double *y, const double c, const ptrdiff_t n) -{ - for (ptrdiff_t i = 0; i < n; i++) - z[i] = x[i] + c * y[i]; -} - -static void standardFloat_cadd(float *z, const float *x, const float *y, const float c, const ptrdiff_t n) -{ - for (ptrdiff_t i = 0; i < n; i++) - z[i] = x[i] + c * y[i]; -} - -static void standardDouble_adds(double *y, const double *x, const double c, const ptrdiff_t n) -{ - for (ptrdiff_t i = 0; i < n; i++) - y[i] = c + x[i]; -} - -static void standardFloat_adds(float *y, const 
float *x, const float c, const ptrdiff_t n) -{ - for (ptrdiff_t i = 0; i < n; i++) - y[i] = c + x[i]; -} - -static void standardDouble_cmul(double *z, const double *x, const double *y, const ptrdiff_t n) -{ - for (ptrdiff_t i = 0; i < n; i++) - z[i] = x[i] * y[i]; -} - -static void standardFloat_cmul(float *z, const float *x, const float *y, const ptrdiff_t n) -{ - for (ptrdiff_t i = 0; i < n; i++) - z[i] = x[i] * y[i]; -} - -static void standardDouble_muls(double *y, const double *x, const double c, const ptrdiff_t n) -{ - for (ptrdiff_t i = 0; i < n; i++) - y[i] = c * x[i]; -} - -static void standardFloat_muls(float *y, const float *x, const float c, const ptrdiff_t n) -{ - for (ptrdiff_t i = 0; i < n; i++) - y[i] = c * x[i]; -} - -static void standardDouble_cdiv(double *z, const double *x, const double *y, const ptrdiff_t n) -{ - for (ptrdiff_t i = 0; i < n; i++) - z[i] = x[i] / y[i]; -} - -static void standardFloat_cdiv(float *z, const float *x, const float *y, const ptrdiff_t n) -{ - for (ptrdiff_t i = 0; i < n; i++) - z[i] = x[i] / y[i]; -} - -static void standardDouble_divs(double *y, const double *x, const double c, const ptrdiff_t n) -{ - for (ptrdiff_t i = 0; i < n; i++) - y[i] = x[i] / c; -} - -static void standardFloat_divs(float *y, const float *x, const float c, const ptrdiff_t n) -{ - for (ptrdiff_t i = 0; i < n; i++) - y[i] = x[i] / c; -} - -double randDouble() -{ - return (double)(rand()%100)/(double)(rand()%100) * (rand()%2 ? -1.0 : 1.0); -} - -int near(double a, double b) -{ - int aClass = fpclassify(a); - int bClass = fpclassify(b); - - if(aClass != bClass) // i.e. is it NAN, infinite, or finite...? - return 0; - - if(aClass == FP_INFINITE) // if it is infinite, the sign must be the same, i.e. 
positive infinity is not near negative infinity - return (signbit(a) == signbit(b)); - else if(aClass == FP_NORMAL) // if it is a normal number then check the magnitude of the difference between the numbers - return fabs(a - b) < 0.001; - else // if both number are of the same class as each other and are of any other class (i.e. such as NAN), then they are near to each other. - return 1; -} - - -//-------------------------------------------------------------------------------------------------- -// Standard tests: -//-------------------------------------------------------------------------------------------------- -void test_THDoubleVector_fill_VSX() -{ - clock_t start, end; - double elapsedSeconds_optimized, elapsedSeconds_standard; - - double *x_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); - double *x_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); - - double yVal0 = 17.2; - double yVal1 = 8.2; - double yVal2 = 5.1; - double yVal3 = -0.9; - - //------------------------------------------------- - // Performance Test - //------------------------------------------------- - start = clock(); - standardDouble_fill(x_standard, yVal0, VSX_PERF_NUM_TEST_ELEMENTS ); - standardDouble_fill(x_standard, yVal1, VSX_PERF_NUM_TEST_ELEMENTS-1); - standardDouble_fill(x_standard, yVal2, VSX_PERF_NUM_TEST_ELEMENTS-2); - standardDouble_fill(x_standard, yVal3, VSX_PERF_NUM_TEST_ELEMENTS-3); - end = clock(); - - elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; - printf("standardDouble_fill() test took %.5lf seconds\n", elapsedSeconds_standard); - - start = clock(); - THDoubleVector_fill_VSX(x_optimized, yVal0, VSX_PERF_NUM_TEST_ELEMENTS ); - THDoubleVector_fill_VSX(x_optimized, yVal1, VSX_PERF_NUM_TEST_ELEMENTS-1); - THDoubleVector_fill_VSX(x_optimized, yVal2, VSX_PERF_NUM_TEST_ELEMENTS-2); - THDoubleVector_fill_VSX(x_optimized, yVal3, VSX_PERF_NUM_TEST_ELEMENTS-3); - end = clock(); - - elapsedSeconds_optimized = 
(double)(end - start) / CLOCKS_PER_SEC; - printf("THDoubleVector_fill_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); - - - //------------------------------------------------- - // Correctness Test - //------------------------------------------------- - yVal0 += 1.0; - yVal1 += 1.0; - yVal2 += 1.0; - yVal3 -= 1.0; - - standardDouble_fill( x_standard, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS); - THDoubleVector_fill_VSX(x_optimized, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS); - for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) - assert(x_optimized[i] == yVal0); - - standardDouble_fill( x_standard+1, yVal1, VSX_FUNC_NUM_TEST_ELEMENTS-2); - THDoubleVector_fill_VSX(x_optimized+1, yVal1, VSX_FUNC_NUM_TEST_ELEMENTS-2); - standardDouble_fill( x_standard+2, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-4); - THDoubleVector_fill_VSX(x_optimized+2, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-4); - standardDouble_fill( x_standard+3, yVal3, VSX_FUNC_NUM_TEST_ELEMENTS-6); - THDoubleVector_fill_VSX(x_optimized+3, yVal3, VSX_FUNC_NUM_TEST_ELEMENTS-6); - standardDouble_fill( x_standard+517, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS-1029); - THDoubleVector_fill_VSX(x_optimized+517, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS-1029); - int r = rand() % 258; - standardDouble_fill( x_standard+517+r, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); - THDoubleVector_fill_VSX(x_optimized+517+r, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); - for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) - assert(x_optimized[i] == x_standard[i]); - printf("All assertions PASSED for THDoubleVector_fill_VSX() test.\n\n"); - - - free(x_standard); - free(x_optimized); -} - - -void test_THFloatVector_fill_VSX() -{ - clock_t start, end; - double elapsedSeconds_optimized, elapsedSeconds_standard; - - float *x_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); - float *x_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); - - float yVal0 = 17.2; - float yVal1 = 8.2; - float yVal2 = 5.1; - float yVal3 = -0.9; 
- - //------------------------------------------------- - // Performance Test - //------------------------------------------------- - start = clock(); - standardFloat_fill(x_standard, yVal0, VSX_PERF_NUM_TEST_ELEMENTS ); - standardFloat_fill(x_standard, yVal1, VSX_PERF_NUM_TEST_ELEMENTS-1); - standardFloat_fill(x_standard, yVal2, VSX_PERF_NUM_TEST_ELEMENTS-2); - standardFloat_fill(x_standard, yVal3, VSX_PERF_NUM_TEST_ELEMENTS-3); - end = clock(); - - elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; - printf("standardFloat_fill() test took %.5lf seconds\n", elapsedSeconds_standard); - - start = clock(); - THFloatVector_fill_VSX(x_optimized, yVal0, VSX_PERF_NUM_TEST_ELEMENTS ); - THFloatVector_fill_VSX(x_optimized, yVal1, VSX_PERF_NUM_TEST_ELEMENTS-1); - THFloatVector_fill_VSX(x_optimized, yVal2, VSX_PERF_NUM_TEST_ELEMENTS-2); - THFloatVector_fill_VSX(x_optimized, yVal3, VSX_PERF_NUM_TEST_ELEMENTS-3); - end = clock(); - - elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; - printf("THFloatVector_fill_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); - - - //------------------------------------------------- - // Correctness Test - //------------------------------------------------- - yVal0 += 1.0; - yVal1 += 1.0; - yVal2 += 1.0; - yVal3 -= 1.0; - - standardFloat_fill( x_standard, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS); - THFloatVector_fill_VSX(x_optimized, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS); - for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) - assert(x_optimized[i] == yVal0); - - standardFloat_fill( x_standard+1, yVal1, VSX_FUNC_NUM_TEST_ELEMENTS-2); - THFloatVector_fill_VSX(x_optimized+1, yVal1, VSX_FUNC_NUM_TEST_ELEMENTS-2); - standardFloat_fill( x_standard+2, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-4); - THFloatVector_fill_VSX(x_optimized+2, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-4); - standardFloat_fill( x_standard+3, yVal3, VSX_FUNC_NUM_TEST_ELEMENTS-6); - THFloatVector_fill_VSX(x_optimized+3, yVal3, VSX_FUNC_NUM_TEST_ELEMENTS-6); 
- standardFloat_fill( x_standard+517, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS-1029); - THFloatVector_fill_VSX(x_optimized+517, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS-1029); - int r = rand() % 258; - standardFloat_fill( x_standard+517+r, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); - THFloatVector_fill_VSX(x_optimized+517+r, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); - for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) - assert(x_optimized[i] == x_standard[i]); - printf("All assertions PASSED for THFloatVector_fill_VSX() test.\n\n"); - - - free(x_standard); - free(x_optimized); -} - - -void test_THDoubleVector_cadd_VSX() -{ - clock_t start, end; - double elapsedSeconds_optimized, elapsedSeconds_standard; - - double *z_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); - double *z_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); - double *x = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); - double *y = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); - double c = randDouble(); - - // Initialize randomly - for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++) - { - x[i] = randDouble(); - y[i] = randDouble(); - } - - - //------------------------------------------------- - // Performance Test - //------------------------------------------------- - start = clock(); - standardDouble_cadd(z_standard, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS ); - standardDouble_cadd(z_standard, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-1); - standardDouble_cadd(z_standard, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-2); - standardDouble_cadd(z_standard, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-3); - end = clock(); - - elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; - printf("standardDouble_cadd() test took %.5lf seconds\n", elapsedSeconds_standard); - - start = clock(); - THDoubleVector_cadd_VSX(z_optimized, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS ); - THDoubleVector_cadd_VSX(z_optimized, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-1); - 
THDoubleVector_cadd_VSX(z_optimized, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-2); - THDoubleVector_cadd_VSX(z_optimized, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-3); - end = clock(); - - elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; - printf("THDoubleVector_cadd_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); - - - //------------------------------------------------- - // Correctness Test - //------------------------------------------------- - standardDouble_cadd( z_standard+1, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); - THDoubleVector_cadd_VSX(z_optimized+1, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); - standardDouble_cadd( z_standard+2, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); - THDoubleVector_cadd_VSX(z_optimized+2, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); - standardDouble_cadd( z_standard+3, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); - THDoubleVector_cadd_VSX(z_optimized+3, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); - standardDouble_cadd( z_standard+517, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); - THDoubleVector_cadd_VSX(z_optimized+517, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); - int r = rand() % 258; - standardDouble_cadd( z_standard+517+r, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); - THDoubleVector_cadd_VSX(z_optimized+517+r, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); - for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) - { - if(!near(z_optimized[i], z_standard[i])) - printf("%d %f %f\n", i, z_optimized[i], z_standard[i]); - assert(near(z_optimized[i], z_standard[i])); - } - printf("All assertions PASSED for THDoubleVector_cadd_VSX() test.\n\n"); - - - free(z_standard); - free(z_optimized); - free(x); -} - -void test_THFloatVector_cadd_VSX() -{ - clock_t start, end; - double elapsedSeconds_optimized, elapsedSeconds_standard; - - float *z_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); - float *z_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); - float *x = (float 
*)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); - float *y = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); - float c = (float)randDouble(); - - // Initialize randomly - for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++) - { - x[i] = (float)randDouble(); - y[i] = (float)randDouble(); - } - - - //------------------------------------------------- - // Performance Test - //------------------------------------------------- - start = clock(); - standardFloat_cadd(z_standard, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS ); - standardFloat_cadd(z_standard, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-1); - standardFloat_cadd(z_standard, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-2); - standardFloat_cadd(z_standard, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-3); - end = clock(); - - elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; - printf("standardFloat_cadd() test took %.5lf seconds\n", elapsedSeconds_standard); - - start = clock(); - THFloatVector_cadd_VSX(z_optimized, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS ); - THFloatVector_cadd_VSX(z_optimized, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-1); - THFloatVector_cadd_VSX(z_optimized, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-2); - THFloatVector_cadd_VSX(z_optimized, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-3); - end = clock(); - - elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; - printf("THFloatVector_cadd_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); - - - //------------------------------------------------- - // Correctness Test - //------------------------------------------------- - standardFloat_cadd( z_standard+1, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); - THFloatVector_cadd_VSX(z_optimized+1, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); - standardFloat_cadd( z_standard+2, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); - THFloatVector_cadd_VSX(z_optimized+2, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); - standardFloat_cadd( z_standard+3, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); - THFloatVector_cadd_VSX(z_optimized+3, x, y, 
c, VSX_FUNC_NUM_TEST_ELEMENTS-6); - standardFloat_cadd( z_standard+517, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); - THFloatVector_cadd_VSX(z_optimized+517, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); - int r = rand() % 258; - standardFloat_cadd( z_standard+517+r, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); - THFloatVector_cadd_VSX(z_optimized+517+r, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); - for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) - { - if(!near(z_optimized[i], z_standard[i])) - printf("%d %f %f\n", i, z_optimized[i], z_standard[i]); - assert(near(z_optimized[i], z_standard[i])); - } - printf("All assertions PASSED for THFloatVector_cadd_VSX() test.\n\n"); - - - free(z_standard); - free(z_optimized); - free(x); -} - -void test_THDoubleVector_adds_VSX() -{ - clock_t start, end; - double elapsedSeconds_optimized, elapsedSeconds_standard; - - double *y_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); - double *y_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); - double *x = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); - double c = randDouble(); - - // Initialize randomly - for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++) - x[i] = randDouble(); - - //------------------------------------------------- - // Performance Test - //------------------------------------------------- - start = clock(); - standardDouble_adds(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS ); - standardDouble_adds(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1); - standardDouble_adds(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2); - standardDouble_adds(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3); - end = clock(); - - elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; - printf("standardDouble_adds() test took %.5lf seconds\n", elapsedSeconds_standard); - - start = clock(); - THDoubleVector_adds_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS ); - THDoubleVector_adds_VSX(y_optimized, x, 
c, VSX_PERF_NUM_TEST_ELEMENTS-1); - THDoubleVector_adds_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2); - THDoubleVector_adds_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3); - end = clock(); - - elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; - printf("THDoubleVector_adds_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); - - - //------------------------------------------------- - // Correctness Test - //------------------------------------------------- - standardDouble_adds( y_standard+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); - THDoubleVector_adds_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); - standardDouble_adds( y_standard+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); - THDoubleVector_adds_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); - standardDouble_adds( y_standard+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); - THDoubleVector_adds_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); - standardDouble_adds( y_standard+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); - THDoubleVector_adds_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); - int r = rand() % 258; - standardDouble_adds( y_standard+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); - THDoubleVector_adds_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); - for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) - { - if(!near(y_optimized[i], y_standard[i])) - printf("%d %f %f\n", i, y_optimized[i], y_standard[i]); - assert(near(y_optimized[i], y_standard[i])); - } - printf("All assertions PASSED for THDoubleVector_adds_VSX() test.\n\n"); - - - free(y_standard); - free(y_optimized); - free(x); -} - - -void test_THFloatVector_adds_VSX() -{ - clock_t start, end; - double elapsedSeconds_optimized, elapsedSeconds_standard; - - float *y_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); - float *y_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); - float *x = (float 
*)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); - float c = (float)randDouble(); - - // Initialize randomly - for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++) - x[i] = (float)randDouble(); - - - //------------------------------------------------- - // Performance Test - //------------------------------------------------- - start = clock(); - standardFloat_adds(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS ); - standardFloat_adds(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1); - standardFloat_adds(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2); - standardFloat_adds(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3); - end = clock(); - - elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; - printf("standardFloat_adds() test took %.5lf seconds\n", elapsedSeconds_standard); - - start = clock(); - THFloatVector_adds_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS ); - THFloatVector_adds_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1); - THFloatVector_adds_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2); - THFloatVector_adds_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3); - end = clock(); - - elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; - printf("THFloatVector_adds_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); - - - //------------------------------------------------- - // Correctness Test - //------------------------------------------------- - standardFloat_adds( y_standard+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); - THFloatVector_adds_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); - standardFloat_adds( y_standard+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); - THFloatVector_adds_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); - standardFloat_adds( y_standard+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); - THFloatVector_adds_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); - standardFloat_adds( y_standard+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); - 
THFloatVector_adds_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); - int r = rand() % 258; - standardFloat_adds( y_standard+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); - THFloatVector_adds_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); - for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) - { - if(!near(y_optimized[i], y_standard[i])) - printf("%d %f %f\n", i, y_optimized[i], y_standard[i]); - assert(near(y_optimized[i], y_standard[i])); - } - printf("All assertions PASSED for THFloatVector_adds_VSX() test.\n\n"); - - - free(y_standard); - free(y_optimized); - free(x); -} - - -void test_THDoubleVector_cmul_VSX() -{ - clock_t start, end; - double elapsedSeconds_optimized, elapsedSeconds_standard; - - double *z_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); - double *z_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); - double *x = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); - double *y = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); - - // Initialize randomly - for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++) - { - x[i] = randDouble(); - y[i] = randDouble(); - } - - - //------------------------------------------------- - // Performance Test - //------------------------------------------------- - start = clock(); - standardDouble_cmul(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS ); - standardDouble_cmul(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-1); - standardDouble_cmul(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-2); - standardDouble_cmul(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-3); - end = clock(); - - elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; - printf("standardDouble_cmul() test took %.5lf seconds\n", elapsedSeconds_standard); - - start = clock(); - THDoubleVector_cmul_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS ); - THDoubleVector_cmul_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-1); - 
THDoubleVector_cmul_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-2); - THDoubleVector_cmul_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-3); - end = clock(); - - elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; - printf("THDoubleVector_cmul_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); - - - //------------------------------------------------- - // Correctness Test - //------------------------------------------------- - standardDouble_cmul( z_standard+1, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-2); - THDoubleVector_cmul_VSX(z_optimized+1, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-2); - standardDouble_cmul( z_standard+2, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-4); - THDoubleVector_cmul_VSX(z_optimized+2, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-4); - standardDouble_cmul( z_standard+3, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-6); - THDoubleVector_cmul_VSX(z_optimized+3, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-6); - standardDouble_cmul( z_standard+517, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-1029); - THDoubleVector_cmul_VSX(z_optimized+517, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-1029); - int r = rand() % 258; - standardDouble_cmul( z_standard+517+r, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); - THDoubleVector_cmul_VSX(z_optimized+517+r, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); - for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) - { - if(!near(z_optimized[i], z_standard[i])) - printf("%d %f %f\n", i, z_optimized[i], z_standard[i]); - assert(near(z_optimized[i], z_standard[i])); - } - printf("All assertions PASSED for THDoubleVector_cmul_VSX() test.\n\n"); - - - free(z_standard); - free(z_optimized); - free(x); -} - -void test_THFloatVector_cmul_VSX() -{ - clock_t start, end; - double elapsedSeconds_optimized, elapsedSeconds_standard; - - float *z_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); - float *z_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); - float *x = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); - float 
*y = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); - - // Initialize randomly - for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++) - { - x[i] = (float)randDouble(); - y[i] = (float)randDouble(); - } - - - //------------------------------------------------- - // Performance Test - //------------------------------------------------- - start = clock(); - standardFloat_cmul(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS ); - standardFloat_cmul(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-1); - standardFloat_cmul(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-2); - standardFloat_cmul(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-3); - end = clock(); - - elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; - printf("standardFloat_cmul() test took %.5lf seconds\n", elapsedSeconds_standard); - - start = clock(); - THFloatVector_cmul_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS ); - THFloatVector_cmul_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-1); - THFloatVector_cmul_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-2); - THFloatVector_cmul_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-3); - end = clock(); - - elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; - printf("THFloatVector_cmul_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); - - - //------------------------------------------------- - // Correctness Test - //------------------------------------------------- - standardFloat_cmul( z_standard+1, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-2); - THFloatVector_cmul_VSX(z_optimized+1, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-2); - standardFloat_cmul( z_standard+2, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-4); - THFloatVector_cmul_VSX(z_optimized+2, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-4); - standardFloat_cmul( z_standard+3, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-6); - THFloatVector_cmul_VSX(z_optimized+3, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-6); - standardFloat_cmul( z_standard+517, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-1029); - 
THFloatVector_cmul_VSX(z_optimized+517, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-1029); - int r = rand() % 258; - standardFloat_cmul( z_standard+517+r, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); - THFloatVector_cmul_VSX(z_optimized+517+r, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); - for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) - { - if(!near(z_optimized[i], z_standard[i])) - printf("%d %f %f\n", i, z_optimized[i], z_standard[i]); - assert(near(z_optimized[i], z_standard[i])); - } - printf("All assertions PASSED for THFloatVector_cmul_VSX() test.\n\n"); - - - free(z_standard); - free(z_optimized); - free(x); -} - -void test_THDoubleVector_muls_VSX() -{ - clock_t start, end; - double elapsedSeconds_optimized, elapsedSeconds_standard; - - double *y_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); - double *y_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); - double *x = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); - double c = randDouble(); - - // Initialize randomly - for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++) - { - x[i] = randDouble(); - } - - - //------------------------------------------------- - // Performance Test - //------------------------------------------------- - start = clock(); - standardDouble_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS ); - standardDouble_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1); - standardDouble_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2); - standardDouble_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3); - end = clock(); - - elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; - printf("standardDouble_muls() test took %.5lf seconds\n", elapsedSeconds_standard); - - start = clock(); - THDoubleVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS ); - THDoubleVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1); - THDoubleVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2); - 
THDoubleVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3); - end = clock(); - - elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; - printf("THDoubleVector_muls_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); - - - //------------------------------------------------- - // Correctness Test - //------------------------------------------------- - standardDouble_muls( y_standard+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); - THDoubleVector_muls_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); - standardDouble_muls( y_standard+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); - THDoubleVector_muls_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); - standardDouble_muls( y_standard+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); - THDoubleVector_muls_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); - standardDouble_muls( y_standard+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); - THDoubleVector_muls_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); - int r = rand() % 258; - standardDouble_muls( y_standard+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); - THDoubleVector_muls_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); - - for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) - { - if(!near(y_optimized[i], y_standard[i])) - printf("%d %f %f\n", i, y_optimized[i], y_standard[i]); - assert(near(y_optimized[i], y_standard[i])); - } - printf("All assertions PASSED for THDoubleVector_muls_VSX() test.\n\n"); - - - free(y_standard); - free(y_optimized); - free(x); -} - -void test_THFloatVector_muls_VSX() -{ - clock_t start, end; - double elapsedSeconds_optimized, elapsedSeconds_standard; - - float *y_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); - float *y_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); - float *x = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); - float c = (float)randDouble(); - - // Initialize randomly - for(int i = 0; i < 
VSX_PERF_NUM_TEST_ELEMENTS; i++) - { - x[i] = (float)randDouble(); - } - - - //------------------------------------------------- - // Performance Test - //------------------------------------------------- - start = clock(); - standardFloat_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS ); - standardFloat_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1); - standardFloat_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2); - standardFloat_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3); - end = clock(); - - elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; - printf("standardFloat_muls() test took %.5lf seconds\n", elapsedSeconds_standard); - - start = clock(); - THFloatVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS ); - THFloatVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1); - THFloatVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2); - THFloatVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3); - end = clock(); - - elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; - printf("THFloatVector_muls_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); - - - //------------------------------------------------- - // Correctness Test - //------------------------------------------------- - standardFloat_muls( y_standard+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); - THFloatVector_muls_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); - standardFloat_muls( y_standard+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); - THFloatVector_muls_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); - standardFloat_muls( y_standard+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); - THFloatVector_muls_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); - standardFloat_muls( y_standard+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); - THFloatVector_muls_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); - int r = rand() % 258; - standardFloat_muls( y_standard+517+r, x, c, 
VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); - THFloatVector_muls_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); - for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) - { - if(!near(y_optimized[i], y_standard[i])) - printf("%d %f %f\n", i, y_optimized[i], y_standard[i]); - assert(near(y_optimized[i], y_standard[i])); - } - printf("All assertions PASSED for THFloatVector_muls_VSX() test.\n\n"); - - - free(y_standard); - free(y_optimized); - free(x); -} - - - -void test_THDoubleVector_cdiv_VSX() -{ - clock_t start, end; - double elapsedSeconds_optimized, elapsedSeconds_standard; - - double *z_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); - double *z_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); - double *x = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); - double *y = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); - - // Initialize randomly - for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++) - { - x[i] = randDouble(); - y[i] = randDouble(); - } - - - //------------------------------------------------- - // Performance Test - //------------------------------------------------- - start = clock(); - standardDouble_cdiv(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS ); - standardDouble_cdiv(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-1); - standardDouble_cdiv(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-2); - standardDouble_cdiv(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-3); - end = clock(); - - elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; - printf("standardDouble_cdiv() test took %.5lf seconds\n", elapsedSeconds_standard); - - start = clock(); - THDoubleVector_cdiv_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS ); - THDoubleVector_cdiv_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-1); - THDoubleVector_cdiv_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-2); - THDoubleVector_cdiv_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-3); - end 
= clock(); - - elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; - printf("THDoubleVector_cdiv_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); - - - //------------------------------------------------- - // Correctness Test - //------------------------------------------------- - standardDouble_cdiv( z_standard+1, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-2); - THDoubleVector_cdiv_VSX(z_optimized+1, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-2); - standardDouble_cdiv( z_standard+2, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-4); - THDoubleVector_cdiv_VSX(z_optimized+2, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-4); - standardDouble_cdiv( z_standard+3, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-6); - THDoubleVector_cdiv_VSX(z_optimized+3, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-6); - standardDouble_cdiv( z_standard+517, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-1029); - THDoubleVector_cdiv_VSX(z_optimized+517, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-1029); - int r = rand() % 258; - standardDouble_cdiv( z_standard+517+r, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); - THDoubleVector_cdiv_VSX(z_optimized+517+r, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); - for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) - { - if(!near(z_optimized[i], z_standard[i])) - printf("%d %f %f\n", i, z_optimized[i], z_standard[i]); - assert(near(z_optimized[i], z_standard[i])); - } - printf("All assertions PASSED for THDoubleVector_cdiv_VSX() test.\n\n"); - - - free(z_standard); - free(z_optimized); - free(x); -} - -void test_THFloatVector_cdiv_VSX() -{ - clock_t start, end; - double elapsedSeconds_optimized, elapsedSeconds_standard; - - float *z_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); - float *z_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); - float *x = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); - float *y = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); - - // Initialize randomly - for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++) - { - x[i] 
= (float)randDouble(); - y[i] = (float)randDouble(); - } - - - //------------------------------------------------- - // Performance Test - //------------------------------------------------- - start = clock(); - standardFloat_cdiv(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS ); - standardFloat_cdiv(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-1); - standardFloat_cdiv(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-2); - standardFloat_cdiv(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-3); - end = clock(); - - elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; - printf("standardFloat_cdiv() test took %.5lf seconds\n", elapsedSeconds_standard); - - start = clock(); - THFloatVector_cdiv_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS ); - THFloatVector_cdiv_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-1); - THFloatVector_cdiv_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-2); - THFloatVector_cdiv_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-3); - end = clock(); - - elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; - printf("THFloatVector_cdiv_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); - - - //------------------------------------------------- - // Correctness Test - //------------------------------------------------- - standardFloat_cdiv( z_standard+1, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-2); - THFloatVector_cdiv_VSX(z_optimized+1, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-2); - standardFloat_cdiv( z_standard+2, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-4); - THFloatVector_cdiv_VSX(z_optimized+2, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-4); - standardFloat_cdiv( z_standard+3, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-6); - THFloatVector_cdiv_VSX(z_optimized+3, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-6); - standardFloat_cdiv( z_standard+517, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-1029); - THFloatVector_cdiv_VSX(z_optimized+517, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-1029); - int r = rand() % 258; - standardFloat_cdiv( z_standard+517+r, x, y, 
VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); - THFloatVector_cdiv_VSX(z_optimized+517+r, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); - for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) - { - if(!near(z_optimized[i], z_standard[i])) - printf("%d %f %f\n", i, z_optimized[i], z_standard[i]); - assert(near(z_optimized[i], z_standard[i])); - } - printf("All assertions PASSED for THFloatVector_cdiv_VSX() test.\n\n"); - - - free(z_standard); - free(z_optimized); - free(x); -} - -void test_THDoubleVector_divs_VSX() -{ - clock_t start, end; - double elapsedSeconds_optimized, elapsedSeconds_standard; - - double *y_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); - double *y_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); - double *x = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); - double c = randDouble(); - - // Initialize randomly - for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++) - { - x[i] = randDouble(); - } - - - //------------------------------------------------- - // Performance Test - //------------------------------------------------- - start = clock(); - standardDouble_divs(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS ); - standardDouble_divs(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1); - standardDouble_divs(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2); - standardDouble_divs(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3); - end = clock(); - - elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; - printf("standardDouble_divs() test took %.5lf seconds\n", elapsedSeconds_standard); - - start = clock(); - THDoubleVector_divs_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS ); - THDoubleVector_divs_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1); - THDoubleVector_divs_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2); - THDoubleVector_divs_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3); - end = clock(); - - elapsedSeconds_optimized = (double)(end - start) / 
CLOCKS_PER_SEC; - printf("THDoubleVector_divs_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); - - - //------------------------------------------------- - // Correctness Test - //------------------------------------------------- - standardDouble_divs( y_standard+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); - THDoubleVector_divs_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); - standardDouble_divs( y_standard+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); - THDoubleVector_divs_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); - standardDouble_divs( y_standard+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); - THDoubleVector_divs_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); - standardDouble_divs( y_standard+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); - THDoubleVector_divs_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); - int r = rand() % 258; - standardDouble_divs( y_standard+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); - THDoubleVector_divs_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); - - for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) - { - if(!near(y_optimized[i], y_standard[i])) - printf("%d %f %f\n", i, y_optimized[i], y_standard[i]); - assert(near(y_optimized[i], y_standard[i])); - } - printf("All assertions PASSED for THDoubleVector_divs_VSX() test.\n\n"); - - - free(y_standard); - free(y_optimized); - free(x); -} - -void test_THFloatVector_divs_VSX() -{ - clock_t start, end; - double elapsedSeconds_optimized, elapsedSeconds_standard; - - float *y_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); - float *y_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); - float *x = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); - float c = (float)randDouble(); - - // Initialize randomly - for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++) - { - x[i] = (float)randDouble(); - } - - - //------------------------------------------------- - // Performance 
Test - //------------------------------------------------- - start = clock(); - standardFloat_divs(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS ); - standardFloat_divs(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1); - standardFloat_divs(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2); - standardFloat_divs(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3); - end = clock(); - - elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; - printf("standardFloat_divs() test took %.5lf seconds\n", elapsedSeconds_standard); - - start = clock(); - THFloatVector_divs_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS ); - THFloatVector_divs_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1); - THFloatVector_divs_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2); - THFloatVector_divs_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3); - end = clock(); - - elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; - printf("THFloatVector_divs_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); - - - //------------------------------------------------- - // Correctness Test - //------------------------------------------------- - standardFloat_divs( y_standard+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); - THFloatVector_divs_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); - standardFloat_divs( y_standard+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); - THFloatVector_divs_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); - standardFloat_divs( y_standard+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); - THFloatVector_divs_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); - standardFloat_divs( y_standard+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); - THFloatVector_divs_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); - int r = rand() % 258; - standardFloat_divs( y_standard+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); - THFloatVector_divs_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); - - for(int i = 0; i < 
VSX_FUNC_NUM_TEST_ELEMENTS; i++) - { - if(!near(y_optimized[i], y_standard[i])) - printf("%d %f %f\n", i, y_optimized[i], y_standard[i]); - assert(near(y_optimized[i], y_standard[i])); - } - printf("All assertions PASSED for THFloatVector_divs_VSX() test.\n\n"); - - - free(y_standard); - free(y_optimized); - free(x); -} - - -//-------------------------------------------------------------------------------------------------- -// Run tests: -//-------------------------------------------------------------------------------------------------- -int main() -{ - printf("\n"); - - - // First test utility functions - - assert(!near(0.1, -0.1)); - assert(!near(0.1f, -0.1f)); - assert(!near(9, 10)); - assert(near(0.1, 0.1000001)); - assert(near(0.1f, 0.1000001f)); - assert(near(100.764, 100.764)); - assert(!near(NAN, 0.0)); - assert(!near(-9.5, NAN)); - assert(!near(NAN, 100)); - assert(!near(-0.0, NAN)); - assert(near(NAN, NAN)); - assert(near(INFINITY, INFINITY)); - assert(near(-INFINITY, -INFINITY)); - assert(!near(INFINITY, NAN)); - assert(!near(0, INFINITY)); - assert(!near(-999.4324, INFINITY)); - assert(!near(INFINITY, 982374.1)); - assert(!near(-INFINITY, INFINITY)); - - - - // Then test each vectorized function - - test_THDoubleVector_fill_VSX(); - test_THFloatVector_fill_VSX(); - - test_THDoubleVector_cadd_VSX(); - test_THFloatVector_cadd_VSX(); - - test_THDoubleVector_adds_VSX(); - test_THFloatVector_adds_VSX(); - - test_THDoubleVector_cmul_VSX(); - test_THFloatVector_cmul_VSX(); - - test_THDoubleVector_muls_VSX(); - test_THFloatVector_muls_VSX(); - - test_THDoubleVector_cdiv_VSX(); - test_THFloatVector_cdiv_VSX(); - - test_THDoubleVector_divs_VSX(); - test_THFloatVector_divs_VSX(); - - - - printf("Finished runnning all tests. 
All tests PASSED.\n"); - return 0; -} - - -#endif // defined RUN_VSX_TESTS - -#endif // defined __PPC64__ - diff --git a/contrib/lua-torch/torch7/lib/luaT/CMakeLists.txt b/contrib/lua-torch/torch7/lib/luaT/CMakeLists.txt deleted file mode 100644 index 518c407f2..000000000 --- a/contrib/lua-torch/torch7/lib/luaT/CMakeLists.txt +++ /dev/null @@ -1,12 +0,0 @@ -# avoid some cmake warnings - -INCLUDE_DIRECTORIES(${LUA_INCDIR}) -IF(LUALIB) - LINK_DIRECTORIES(${LUA_LIBDIR}) # note: must be done before defining target -ENDIF() - -ADD_LIBRARY(luaT STATIC luaT.h luaT.c) - -IF(LUALIB) - TARGET_LINK_LIBRARIES(luaT ${LUALIB}) # must be done after ;) -ENDIF() diff --git a/contrib/lua-torch/torch7/lib/luaT/README.md b/contrib/lua-torch/torch7/lib/luaT/README.md deleted file mode 100644 index 235b8edc0..000000000 --- a/contrib/lua-torch/torch7/lib/luaT/README.md +++ /dev/null @@ -1,266 +0,0 @@ -<a name="luat.dok"></a> -# Lua Torch C API # - -luaT provides an API to interface Lua and C in Torch packages. It defines a -concept of _classes_ to Lua for Torch, and provides a mechanism to easily -handle these Lua classes from C. - -It additionally provides few functions that `luaL` should have defined, and -defines several functions similar to `luaL` ones for better type error printing when using -`luaT` classes. - -<a name="luat.memory.dok"></a> -## Memory functions ## - -Classical memory allocation functions which generate a Lua error in case of -problem. - -<a name="luaT_alloc"></a> -### void* luaT_alloc(lua_State *L, long size) ### - -Allocates `size` bytes, and return a pointer on the allocated -memory. A Lua error will be generated if running out of memory. - -<a name="luaT_realloc"></a> -### void* luaT_realloc(lua_State *L, void *ptr, long size) ### - -Realloc `ptr` to `size` bytes. `ptr` must have been previously -allocated with [luaT_alloc](#luaT_alloc) or -[luaT_realloc](#luaT_realloc), or the C `malloc` or `realloc` -functions. 
A Lua error will be generated if running out of memory. - -<a name="luaT_free"></a> -### void luaT_free(lua_State *L, void *ptr) ### - -Free memory allocated at address `ptr`. The memory must have been -previously allocated with [luaT_alloc](#luaT_alloc) or -[luaT_realloc](#luaT_realloc), or the C `malloc` or `realloc` -functions. - -<a name="luat.classcreate"></a> -## Class creation and basic handling ## - -A `luaT` class is basically either a Lua _table_ or _userdata_ with -an appropriate _metatable_. This appropriate metatable is created with -[luaT_newmetatable](#luaT_newmetatable). Contrary to luaL userdata -functions, luaT mechanism handles inheritance. If the class inherit from -another class, then the metatable will itself have a metatable -corresponding to the _parent metatable_: the metatables are cascaded -according to the class inheritance. Multiple inheritance is not supported. - -<a name="luat.operatoroverloading"></a> -### Operator overloading ### - -The metatable of a `luaT` object contains `Lua` operators like -`__index`, `__newindex`, `__tostring`, `__add` -(etc...). These operators will respectively look for `__index__`, -`__newindex__`, `__tostring__`, `__add__` (etc...) in the -metatable. If found, the corresponding function or value will be returned, -else a Lua error will be raised. - -If one wants to provide `__index__` or `__newindex__` in the -metaclass, these operators must follow a particular scheme: - - * `__index__` must either return a value _and_ `true` or return `false` only. In the first case, it means `__index__` was able to handle the given argument (for e.g., the type was correct). The second case means it was not able to do anything, so `__index` in the root metatable can then try to see if the metaclass contains the required value. - - * `__newindex__` must either return `true` or `false`. As for `__index__`, `true` means it could handle the argument and `false` not. 
If not, the root metatable `__newindex` will then raise an error if the object was a userdata, or apply a rawset if the object was a Lua table. - -Other metaclass operators like `__tostring__`, `__add__`, etc... do not have any particular constraint. - -<a name="luat_newlocalmetatable"></a> -### const char* luaT_newlocalmetatable(lua_State *L, const char *tname, const char *parenttname, lua_CFunction constructor, lua_CFunction destructor, lua_CFunction factory, int moduleidx) ### - -This function creates a new metatable, which is the Lua way to define a new -object class. As for `luaL_newmetatable`, the metatable is registered in -the Lua registry table, with the key `tname`. In addition, `tname` is -also registered in the Lua registry, with the metatable as key (the -typename of a given object can be thus easily retrieved). - -The class name `tname` must be of the form `modulename.classname`. If not -NULL, `parenttname` must be a valid typename corresponding to the parent -class of the new class. - -If `constructor` is not NULL, a function `new` will be added to the -metatable, pointing to this given function. - -A "constructor table" will be created by `luaT_newlocalmetatable`: it will -contain all the class methods, and be callable, calling the `constructor`, if -a `constructor` has been passed. The constructor table is either stored into -`modulename.classname` (that is in the global namespace) if `moduleidx <= -0` or in the table at index `moduleidx` in the stack (if `moduleidx > 0`). - -If not NULL, `destructor` will be called when garbage collecting the object. - -If not NULL, `factory` must be a Lua C function creating an empty object -instance of the class. This functions are used in Torch for serialization. - -Note that classes can be partly defined in C and partly defined in Lua: -once the metatable is created in C, it can be filled up with additional -methods in Lua. - -The return value is the value returned by [luaT_typenameid](#luat_typenameid). 
- -<a name="luat_newmetatable"></a> -### const char* luaT_newmetatable(lua_State *L, const char *tname, const char *parenttname, lua_CFunction constructor, lua_CFunction destructor, lua_CFunction factory) ### - -Same as [luaT_newlocalmetatable](#luat_newmetatable), but where the -constructor table is assigned in the global namespace (`moduleidx = 0`). - -<a name="luat_pushmetatable"></a> -### int luaT_pushmetatable(lua_State *L, const name *tname) ### - -Push the metatable with type name `tname` on the stack, if `tname` is a -valid Torch class name (previously registered with luaT_newmetatable). - -On success, returns 1. If `tname` is invalid, nothing is pushed and it -returns 0. - -<a name="luat_typenameid"></a> -### const char* luaT_typenameid(lua_State *L, const char *tname) ### - -If `tname` is a valid Torch class name, then returns a unique string (the -contents will be the same as `tname`) pointing to the string registered -in the Lua registry. This string is thus valid as long as Lua is -running. The returned string shall not be freed. - -If `tname` is an invalid class name, returns NULL. - -<a name="luat_typename"></a> -### const char* luaT_typename(lua_State *L, int ud) ### - -Returns the typename of the object at index `ud` on the stack. If it is -not a valid Torch object, returns NULL. - -<a name="luat_pushudata"></a> -### void luaT_pushudata(lua_State *L, void *udata, const char *tname) ### - -Given a C structure `udata`, push a userdata object on the stack with -metatable corresponding to `tname`. Obviously, `tname` must be a valid -Torch name registered with [luaT_newmetatable](#luat_newmetatable). - -<a name="luat_toudata"></a> -### void *luaT_toudata(lua_State *L, int ud, const char *tname) ### - -Returns a pointer to the original C structure previously pushed on the -stack with [luaT_pushudata](#luat_pushudata), if the object at index -`ud` is a valid Torch class name. Returns NULL otherwise. 
- -<a name="luat_isudata"></a> -### int luaT_isudata(lua_State *L, int ud, const char *tname) ### - -Returns 1 if the object at index `ud` on the stack is a valid Torch class name `tname`. -Returns 0 otherwise. - -<a name="luat_getfield"></a> -### Checking fields of a table ### - -This functions check that the table at the given index `ud` on the Lua -stack has a field named `field`, and that it is of the specified type. -These function raises a Lua error on failure. - -<a name="luat_getfieldcheckudata"></a> -## void *luaT_getfieldcheckudata(lua_State *L, int ud, const char *field, const char *tname) ## - -Checks that the field named `field` of the table at index `ud` is a -Torch class name `tname`. Returns the pointer of the C structure -previously pushed on the stack with [luaT_pushudata](#luat_pushudata) on -success. The function raises a Lua error on failure. - -<a name="luat_getfieldchecklightudata"></a> -## void *luaT_getfieldchecklightudata(lua_State *L, int ud, const char *field) ## - -Checks that the field named `field` of the table at index `ud` is a -lightuserdata. Returns the lightuserdata pointer on success. The function -raises a Lua error on failure. - -<a name="luat_getfieldcheckint"></a> -## int luaT_getfieldcheckint(lua_State *L, int ud, const char *field) ## - -Checks that the field named `field` of the table at index `ud` is an -int. Returns the int value pointer on success. The function raises a Lua -error on failure. - -<a name="luat_getfieldcheckstring"></a> -## const char* luaT_getfieldcheckstring(lua_State *L, int ud, const char *field) ## - -Checks that the field named `field` of the table at index `ud` is a -string. Returns a pointer to the string on success. The function raises a -Lua error on failure. - -<a name="luat_getfieldcheckboolean"></a> -## int luaT_getfieldcheckboolean(lua_State *L, int ud, const char *field) ## - -Checks that the field named `field` of the table at index `ud` is a -boolean. 
On success, returns 1 if the boolean is `true`, 0 if it is -`false`. The function raises a Lua error on failure. - -<a name="luat_getfieldchecktable"></a> -## void luaT_getfieldchecktable(lua_State *L, int ud, const char *field) ## - -Checks that the field named `field` of the table at index `ud` is a -table. On success, push the table on the stack. The function raises a Lua -error on failure. - -<a name="luat_typerror"></a> -### int luaT_typerror(lua_State *L, int ud, const char *tname) ### - -Raises a `luaL_argerror` (and returns its value), claiming that the -object at index `ud` on the stack is not of type `tname`. Note that -this function does not check the type, it only raises an error. - -<a name="luat_checkboolean"></a> -### int luaT_checkboolean(lua_State *L, int ud) ### - -Checks that the value at index `ud` is a boolean. On success, returns 1 -if the boolean is `true`, 0 if it is `false`. The function raises a Lua -error on failure. - -<a name="luat_optboolean"></a> -### int luaT_optboolean(lua_State *L, int ud, int def) ### - -Checks that the value at index `ud` is a boolean. On success, returns 1 -if the boolean is `true`, 0 if it is `false`. If there is no value at -index `ud`, returns `def`. In any other cases, raises an error. - -<a name="luat_registeratname"></a> -### void luaT_registeratname(lua_State *L, const struct luaL_Reg *methods, const char *name) ### - -This function assume a table is on the stack. It creates a table field -`name` in the table (if this field does not exist yet), and fill up -`methods` in this table field. - -<a name="luat_classrootname"></a> -### const char *luaT_classrootname(const char *tname) ### - -Assuming `tname` is of the form `A.b.c`, returns 'c'. The returned value -shall not be freed. It is a pointer inside `tname` string. 
- -<a name="luat_classmodulename"></a> -### int luaT_classmodulename(const char *tname, char *parent_name) ### -Alias to `luaT_fullparentname ` for ensuring backwards compatibility; -use of `luaT_fullparentname` is preferred. - -<a name="luat_fullparentname"></a> -### int luaT_fullparentname(const char *tname, char *parent_name) ### - -Returns a 0-1 valued integer indicating whether `tname` has a parent module. -Assuming `tname` is of the form `A.b.c`, sets `parent_name` to `A.b`. - -<a name="luat_classmodulename"></a> -### int luaT_outerparentname(const char *tname, char *parent_name) ### - -Returns a 0-1 valued integer indicating whether `tname` has a parent module. -Assuming `tname` is of the form `A.b.c`, sets `parent_name` to `A`. - -<a name="luat_classmodulename"></a> -### int luaT_innerparentname(const char *tname, char *parent_name) ### - -Returns a 0-1 valued integer indicating whether `tname` has a parent module. -Assuming `tname` is of the form `A.b.c`, sets `parent_name` to `b`. - -<a name="luat_stackdump"></a> -### void luaT_stackdump(lua_State *L) ### - -This function print outs the state of the Lua stack. It is useful for debug -purposes. - diff --git a/contrib/lua-torch/torch7/lib/luaT/luaT.c b/contrib/lua-torch/torch7/lib/luaT/luaT.c deleted file mode 100644 index d87f5d54c..000000000 --- a/contrib/lua-torch/torch7/lib/luaT/luaT.c +++ /dev/null @@ -1,1373 +0,0 @@ -#include <stdlib.h> -#include <string.h> -#include <stdint.h> - -#include "luaT.h" - -void* luaT_alloc(lua_State *L, ptrdiff_t size) -{ - void *ptr; - - if(size == 0) - return NULL; - - if(size < 0) - luaL_error(L, "$ Torch: invalid memory size -- maybe an overflow?"); - - ptr = malloc(size); - if(!ptr) - luaL_error(L, "$ Torch: not enough memory: you tried to allocate %dGB. 
Buy new RAM!", size/1073741824); - - return ptr; -} - -void* luaT_realloc(lua_State *L, void *ptr, ptrdiff_t size) -{ - if(!ptr) - return(luaT_alloc(L, size)); - - if(size == 0) - { - luaT_free(L, ptr); - return NULL; - } - - if(size < 0) - luaL_error(L, "$ Torch: invalid memory size -- maybe an overflow?"); - - ptr = realloc(ptr, size); - if(!ptr) - luaL_error(L, "$ Torch: not enough memory: you tried to reallocate %dGB. Buy new RAM!", size/1073741824); - return ptr; -} - -void luaT_free(lua_State *L, void *ptr) -{ - free(ptr); -} - -void luaT_setfuncs(lua_State *L, const luaL_Reg *l, int nup) -{ -#if LUA_VERSION_NUM == 501 - luaL_checkstack(L, nup+1, "too many upvalues"); - for (; l->name != NULL; l++) { /* fill the table with given functions */ - int i; - lua_pushstring(L, l->name); - for (i = 0; i < nup; i++) /* copy upvalues to the top */ - lua_pushvalue(L, -(nup+1)); - lua_pushcclosure(L, l->func, nup); /* closure with those upvalues */ - lua_settable(L, -(nup + 3)); - } - lua_pop(L, nup); /* remove upvalues */ -#else - luaL_setfuncs(L, l, nup); -#endif -} - -void luaT_stackdump(lua_State *L) -{ - int i; - const char *tname = NULL; - int top = lua_gettop(L); - for(i = 1; i <= top; i++) - { - int t = lua_type(L, i); - printf("%3d. ", i); - switch(t) - { - case LUA_TSTRING: - printf("'%s'", lua_tostring(L,i)); - break; - case LUA_TBOOLEAN: - printf(lua_toboolean(L, i) ? "true" : "false"); - break; - case LUA_TNUMBER: - printf("%g", lua_tonumber(L,i)); - break; - case LUA_TUSERDATA: - tname = luaT_typename(L, i); - printf("userdata %p [%s]", lua_topointer(L, i), (tname ? tname : "not a Torch object")); - break; - case 10: - tname = luaT_typename(L, i); - printf("cdata %p [%s]", lua_topointer(L, i), (tname ? 
tname : "not a Torch object")); - break; - case LUA_TTABLE: - lua_pushvalue(L, i); - lua_rawget(L, LUA_REGISTRYINDEX); - if(lua_isstring(L, -1)) - tname = lua_tostring(L, -1); /*luaT_typenameid(L, lua_tostring(L, -1)); */ - else - tname = NULL; - lua_pop(L, 1); - if(tname) - printf("metatable [%s]", tname); - else - { - tname = luaT_typename(L, i); - printf("table %p [%s]", lua_topointer(L, i), (tname ? tname : "not a Torch object")); - } - break; - default: - printf("Lua object type: %s", lua_typename(L,t)); - break; - } - printf("\n"); - } - printf("---------------------------------------------\n"); -} - -/* metatable operator methods */ -static int luaT_mt__index(lua_State *L); -static int luaT_mt__newindex(lua_State *L); -static int luaT_mt__tostring(lua_State *L); -static int luaT_mt__add(lua_State *L); -static int luaT_mt__sub(lua_State *L); -static int luaT_mt__mul(lua_State *L); -static int luaT_mt__div(lua_State *L); -static int luaT_mt__mod(lua_State *L); -static int luaT_mt__pow(lua_State *L); -static int luaT_mt__unm(lua_State *L); -static int luaT_mt__concat(lua_State *L); -static int luaT_mt__len(lua_State *L); -static int luaT_mt__eq(lua_State *L); -static int luaT_mt__lt(lua_State *L); -static int luaT_mt__le(lua_State *L); -static int luaT_mt__call(lua_State *L); - -/* Constructor-metatable methods */ -static int luaT_cmt__call(lua_State *L); -static int luaT_cmt__newindex(lua_State *L); - -const char* luaT_newmetatable(lua_State *L, const char *tname, const char *parent_tname, - lua_CFunction constructor, lua_CFunction destructor, lua_CFunction factory) -{ - return luaT_newlocalmetatable(L, tname, parent_tname, - constructor, destructor, factory, 0); -} - -const char* luaT_newlocalmetatable(lua_State *L, const char *tname, const char *parent_tname, - lua_CFunction constructor, lua_CFunction destructor, lua_CFunction factory, int moduleidx) -{ - lua_pushcfunction(L, luaT_lua_newmetatable); - lua_pushstring(L, tname); - (parent_tname ? 
(void)lua_pushstring(L, parent_tname) : lua_pushnil(L)); - (constructor ? lua_pushcfunction(L, constructor) : lua_pushnil(L)); - (destructor ? lua_pushcfunction(L, destructor) : lua_pushnil(L)); - (factory ? lua_pushcfunction(L, factory) : lua_pushnil(L)); - (moduleidx > 0 ? lua_pushvalue(L, moduleidx) : lua_pushnil(L)); - lua_call(L, 6, 1); - return luaT_typenameid(L, tname); -} - -int luaT_pushmetatable(lua_State *L, const char *tname) -{ - lua_getfield(L, LUA_REGISTRYINDEX, tname); - if(lua_isnil(L, -1)) - { - lua_pop(L, 1); - return 0; - } - return 1; -} - -const char *luaT_typenameid(lua_State *L, const char *tname) -{ - if(luaT_pushmetatable(L, tname)) - { - const char *tnameid = NULL; - lua_rawget(L, LUA_REGISTRYINDEX); - if(lua_isstring(L, -1)) - tnameid = lua_tostring(L, -1); - lua_pop(L, 1); /* the string/nil */ - return tnameid; - } - return NULL; -} - -static const char cdataname[] = "" - "local ok, ffi = pcall(require, 'ffi')\n" - "if ok then\n" - " local id2name = {}\n" - " return function(cdata, name)\n" - " local id\n" - " if jit then\n" - " id = tonumber(ffi.typeof(cdata))\n" - " else\n" - " id = tostring(ffi.typeof(cdata))\n" - " end\n" - " if id then\n" - " if name then\n" - " id2name[id] = name\n" - " return name\n" - " else\n" - " return rawget(id2name, id)\n" - " end\n" - " end\n" - " return nil\n" - " end\n" - "else\n" - " return function() end\n" - "end\n"; - -static const char* luaT_cdataname(lua_State *L, int ud, const char *tname) -{ - lua_pushstring(L, "__cdataname"); - lua_rawget(L, LUA_REGISTRYINDEX); - if(lua_isnil(L,-1)) - { - lua_pop(L, 1); - - if(luaL_dostring(L, cdataname)) /* did something go wrong? 
*/ - luaL_error(L, "internal error (could not load cdataname): %s", lua_tostring(L, -1)); - - lua_pushstring(L, "__cdataname"); - lua_pushvalue(L, -2); - lua_rawset(L, LUA_REGISTRYINDEX); - } - if(!lua_isfunction(L, -1)) /* should not happen */ - luaL_error(L, "internal error (cdataname is not a function)"); - - lua_pushvalue(L, ud); - if(tname) - lua_pushstring(L, tname); - if(lua_pcall(L, (tname ? 2 : 1), 1, 0)) - luaL_error(L, "internal error (cdataname): %s", lua_tostring(L, -1)); - - tname = lua_tostring(L, -1); - lua_pop(L, 1); - - return tname; -} - -static void* CDATA_MT_KEY = &CDATA_MT_KEY; -static const char cdatamt[] = "" - "local ok, ffi = pcall(require, 'ffi')\n" - "if ok and not jit then\n" - " return ffi.debug().cdata_mt\n" - "else\n" - " return {}\n" - "end\n"; - -static int luaT_iscdata(lua_State *L, int ud) -{ - int type = lua_type(L, ud); - if(type == 10) - return 1; - if(type != LUA_TUSERDATA) - return 0; - if(!lua_getmetatable(L, ud)) - return 0; - - lua_pushlightuserdata(L, CDATA_MT_KEY); - lua_rawget(L, LUA_REGISTRYINDEX); - if (lua_isnil(L, -1)) - { - // initialize cdata metatable - lua_pop(L, 1); - if(luaL_dostring(L, cdatamt)) - luaL_error(L, "internal error (could not load cdata mt): %s", lua_tostring(L, -1)); - - lua_pushlightuserdata(L, CDATA_MT_KEY); - lua_pushvalue(L, -2); - lua_rawset(L, LUA_REGISTRYINDEX); - } - - int iscdata = lua_rawequal(L, -1, -2); - lua_pop(L, 2); - return iscdata; -} - -const char* luaT_typename(lua_State *L, int ud) -{ - if(luaT_iscdata(L, ud)) - return luaT_cdataname(L, ud, NULL); - else if(lua_getmetatable(L, ud)) - { - const char *tname = NULL; - lua_rawget(L, LUA_REGISTRYINDEX); - if(lua_isstring(L, -1)) - tname = lua_tostring(L, -1); - lua_pop(L, 1); /* the string/nil */ - return tname; - } - return NULL; -} - -void luaT_pushudata(lua_State *L, void *udata, const char *tname) -{ - if(udata) - { - void **udata_p = lua_newuserdata(L, sizeof(void*)); - *udata_p = udata; - if(!luaT_pushmetatable(L, tname)) - 
luaL_error(L, "Torch internal problem: cannot find metatable for type <%s>", tname); - lua_setmetatable(L, -2); - } - else - lua_pushnil(L); -} - -void *luaT_toudata(lua_State *L, int ud, const char *tname) -{ - void **p = lua_touserdata(L, ud); - if(p != NULL) /* value is a userdata? */ - { - if(!luaT_pushmetatable(L, tname)) - luaL_error(L, "Torch internal problem: cannot find metatable for type <%s>", tname); - - /* initialize the table we want to get the metatable on */ - /* note that we have to be careful with indices, as we just inserted stuff */ - lua_pushvalue(L, (ud < 0 ? ud - 1 : ud)); - while(lua_getmetatable(L, -1)) /* get the next metatable */ - { - lua_remove(L, -2); /* remove the previous metatable [or object, if first time] */ - if(lua_rawequal(L, -1, -2)) - { - lua_pop(L, 2); /* remove the two metatables */ - return *p; - } - } - lua_pop(L, 2); /* remove the two metatables */ - } - return NULL; -} - -int luaT_isudata(lua_State *L, int ud, const char *tname) -{ - if(luaT_toudata(L, ud, tname)) - return 1; - else - return 0; -} - -void *luaT_checkudata(lua_State *L, int ud, const char *tname) -{ - void *p = luaT_toudata(L, ud, tname); - if(!p) - luaT_typerror(L, ud, tname); - return p; -} - -void luaT_pushlong(lua_State *L, long n) -{ -#if LUA_VERSION_NUM >= 503 - /* Only push the value as an integer if it fits in lua_Integer, - or if the lua_Number representation will be even worse */ - if (sizeof(lua_Integer) >= sizeof(long) || sizeof(lua_Number) <= sizeof(lua_Integer)) { - lua_pushinteger(L, n); - } else { - lua_pushnumber(L, (lua_Number)n); - } -#else - lua_pushnumber(L, (lua_Number)n); -#endif -} - -long luaT_checklong(lua_State *L, int idx) -{ -#if LUA_VERSION_NUM >= 503 - if (sizeof(lua_Integer) >= sizeof(long) || sizeof(lua_Number) <= sizeof(lua_Integer)) { - return (long)luaL_checkinteger(L, idx); - } else { - return (long)luaL_checknumber(L, idx); - } -#else - return (long)luaL_checknumber(L, idx); -#endif -} - -long luaT_tolong(lua_State 
*L, int idx) -{ -#if LUA_VERSION_NUM == 503 - if (sizeof(lua_Integer) >= sizeof(long) || sizeof(lua_Number) <= sizeof(lua_Integer)) { - return (long)lua_tointeger(L, idx); - } else { - return (long)lua_tonumber(L, idx); - } -#else - return (long)lua_tonumber(L, idx); -#endif -} - -void luaT_pushinteger(lua_State *L, ptrdiff_t n) -{ -#if LUA_VERSION_NUM >= 503 - /* Only push the value as an integer if it fits in lua_Integer, - or if the lua_Number representation will be even worse */ - if (sizeof(lua_Integer) >= sizeof(ptrdiff_t) || sizeof(lua_Number) <= sizeof(lua_Integer)) { - lua_pushinteger(L, n); - } else { - lua_pushnumber(L, (lua_Number)n); - } -#else - lua_pushnumber(L, (lua_Number)n); -#endif -} - -ptrdiff_t luaT_checkinteger(lua_State *L, int idx) -{ -#if LUA_VERSION_NUM >= 503 - if (sizeof(lua_Integer) >= sizeof(ptrdiff_t) || sizeof(lua_Number) <= sizeof(lua_Integer)) { - return (ptrdiff_t)luaL_checkinteger(L, idx); - } else { - return (ptrdiff_t)luaL_checknumber(L, idx); - } -#else - return (ptrdiff_t)luaL_checknumber(L, idx); -#endif -} - -void *luaT_getfieldcheckudata(lua_State *L, int ud, const char *field, const char *tname) -{ - void *p; - lua_getfield(L, ud, field); - if(lua_isnil(L, -1)) - luaL_error(L, "bad argument #%d (field %s does not exist)", ud, field); - p = luaT_toudata(L, -1, tname); - if(!p) - luaL_error(L, "bad argument #%d (field %s is not a %s)", ud, field, tname); - return p; -} - -void *luaT_getfieldchecklightudata(lua_State *L, int ud, const char *field) -{ - void *p; - lua_getfield(L, ud, field); - if(lua_isnil(L, -1)) - luaL_error(L, "bad argument #%d (field %s does not exist)", ud, field); - - if(!lua_islightuserdata(L, -1)) - luaL_error(L, "bad argument #%d (field %s is not a light userdata)", ud, field); - - p = lua_touserdata(L, -1); - - return p; -} - -double luaT_getfieldchecknumber(lua_State *L, int ud, const char *field) -{ - lua_getfield(L, ud, field); - if(lua_isnil(L, -1)) - luaL_error(L, "bad argument #%d (field %s 
does not exist)", ud, field); - if(!lua_isnumber(L, -1)) - luaL_error(L, "bad argument #%d (field %s is not a number)", ud, field); - return lua_tonumber(L, -1); -} - -int luaT_getfieldcheckint(lua_State *L, int ud, const char *field) -{ - lua_getfield(L, ud, field); - if(lua_isnil(L, -1)) - luaL_error(L, "bad argument #%d (field %s does not exist)", ud, field); - if(!lua_isnumber(L, -1)) - luaL_error(L, "bad argument #%d (field %s is not a number)", ud, field); - return (int)lua_tonumber(L, -1); -} - -const char* luaT_getfieldcheckstring(lua_State *L, int ud, const char *field) -{ - lua_getfield(L, ud, field); - if(lua_isnil(L, -1)) - luaL_error(L, "bad argument #%d (field %s does not exist)", ud, field); - if(!lua_isstring(L, -1)) - luaL_error(L, "bad argument #%d (field %s is not a string)", ud, field); - return lua_tostring(L, -1); -} - -int luaT_getfieldcheckboolean(lua_State *L, int ud, const char *field) -{ - lua_getfield(L, ud, field); - if(lua_isnil(L, -1)) - luaL_error(L, "bad argument #%d (field %s does not exist)", ud, field); - if(!lua_isboolean(L, -1)) - luaL_error(L, "bad argument #%d (field %s is not a boolean)", ud, field); - return lua_toboolean(L, -1); -} - -void luaT_getfieldchecktable(lua_State *L, int ud, const char *field) -{ - lua_getfield(L, ud, field); - if(lua_isnil(L, -1)) - luaL_error(L, "bad argument #%d (field %s does not exist)", ud, field); - if(!lua_istable(L, -1)) - luaL_error(L, "bad argument #%d (field %s is not a table)", ud, field); -} - -/**** type checks as in luaL ****/ -int luaT_typerror(lua_State *L, int ud, const char *tname) -{ - const char *msg; - const char *tnameud = luaT_typename(L, ud); - - if(!tnameud) - tnameud = lua_typename(L, ud); - - msg = lua_pushfstring(L, "%s expected, got %s", - tname, - (tnameud ? 
tnameud : "unknown object")); - - return luaL_argerror(L, ud, msg); -} - -int luaT_checkboolean(lua_State *L, int ud) -{ - if(!lua_isboolean(L, ud)) - luaT_typerror(L, ud, lua_typename(L, LUA_TBOOLEAN)); - return lua_toboolean(L, ud); -} - -int luaT_optboolean(lua_State *L, int ud, int def) -{ - if(lua_isnoneornil(L,ud)) - return def; - - return luaT_checkboolean(L, ud); -} - -void luaT_registeratname(lua_State *L, const struct luaL_Reg *methods, const char *name) -{ - int idx = lua_gettop(L); - - luaL_checktype(L, idx, LUA_TTABLE); - lua_pushstring(L, name); - lua_rawget(L, idx); - - if(lua_isnil(L, -1)) - { - lua_pop(L, 1); - lua_pushstring(L, name); - lua_newtable(L); - lua_rawset(L, idx); - - lua_pushstring(L, name); - lua_rawget(L, idx); - } - - luaT_setfuncs(L, methods, 0); - lua_pop(L, 1); -} - - -/* returns the name of the class itself (sans nesting) */ -const char* luaT_classrootname(const char *tname) -{ - int idx; - int sz = strlen(tname); - - for(idx = sz-1; idx >= 0 ; idx--) - { - if(tname[idx] == '.') - return tname+idx+1; - } - return tname; -} - -/* parent_name must be a buffer at least as big as tname. - * If class has a parent, returns true; and, sets - * parent name to that of full parent hierarchy (e.g. - * given class `A.b.c`, sets parent_name to `A.b`) - */ -int luaT_fullparentname(const char *tname, char *parent_name) -{ - int sz = strlen(tname); - int idx; - for(idx = sz-1; idx > 0 ; idx--) - if(tname[idx] == '.' || tname[idx] == '\0') break; - - if (idx > 0) strncpy(parent_name, tname, idx); - parent_name[idx] = '\0'; - return tname[idx] == '.'; -} - -/* alias for ensuring backwards compatibility; - * use of luaT_fullparentname is preferred. - */ -int luaT_classmodulename(const char *tname, char *parent_name) -{ - return luaT_fullparentname(tname, parent_name); -} - -/* parent_name must be a buffer at least as big as tname. - * If class has a parent, returns true; and, sets - * parent name to that of outermost parent (e.g. 
- * given class `A.b.c`, sets parent_name to `A`) - */ -int luaT_outerparentname(const char *tname, char *parent_name) -{ - char chars[] = {'.', '\0'}; - size_t idx; - idx = strcspn(tname, chars); - strncpy(parent_name, tname, idx); - parent_name[idx] = '\0'; - return tname[idx] == '.'; -} - -/* parent_name must be a buffer at least as big as tname. - * If class has a parent, returns true; and, sets parent - * name to that of innermost parent (e.g. given class - * `A.b.c`, sets parent_name to `b`). In the comments - * below, the inner parent name is abbreviated as IPN. - */ -int luaT_innerparentname(const char *tname, char *parent_name) -{ - int sz = strlen(tname); - int tail, head; - for(tail = sz-1; tail >= 0 ; tail--) // tail points to - if(tname[tail] == '.') break; // just past IPN - - if (tail == 0) return 0; - - for(head = tail-1; head >= 0; head--) // head points to - if(tname[head] == '.') break; // just before IPN - - head += 1; // update head to start of IPN - tail -= head; // update tail to strlen(IPN) - strncpy(parent_name, tname+head, tail); - parent_name[tail] = '\0'; - return 1; -} - -/* Method for pushing a class's immediate parent to the - * stack (e.g. given class `A.b.c`, pushes `b` to the stack) - */ -void luaT_getinnerparent(lua_State *L, const char *tname) -{ - /* Local variables */ - char term[256]; - char chars[] = {'.', '\0'}; - const char *tname_full = tname; // used for error case - - /* Get outermost table from Lua */ - int n = strcspn(tname, chars); - strncpy(term, tname, n); - term[n] = '\0'; - lua_getglobal(L, term); - tname += n + 1; - - /* Traverse hierarchy down to last table*/ - n = strcspn(tname, chars); - while(n < strlen(tname)) - { - /* Check that current parent is a table (i.e. 
a module) */ - if(!lua_istable(L, -1)){ - strncpy(term, tname_full, tname - tname_full - 1); - term[tname - tname_full] = '\0'; - luaL_error(L, "while creating metatable %s: bad argument #1 (%s is an invalid module name)", tname_full, term); - } - strncpy(term, tname, n); - term[n] = '\0'; - lua_getfield(L, -1, term); - lua_remove(L, -2); - tname += n + 1; - n = strcspn(tname, chars); // prepare for next - } - - /* Check that resulting parent is a table (i.e. a module) */ - if(!lua_istable(L, -1)){ - strncpy(term, tname_full, tname - tname_full - 1); - term[tname - tname_full] = '\0'; - luaL_error(L, "while creating metatable %s: bad argument #1 (%s is an invalid module name)", tname_full, term); - } -} - - -int luaT_lua_newmetatable(lua_State *L) -{ - /* Local Variables */ - const char* tname = luaL_checkstring(L, 1); - char parent_name[256]; - int is_in_module = 0; - - /* Argument Checking */ - lua_settop(L, 6); - luaL_argcheck(L, lua_isnoneornil(L, 2) || lua_isstring(L, 2), 2, "parent class name or nil expected"); - luaL_argcheck(L, lua_isnoneornil(L, 3) || lua_isfunction(L, 3), 3, "constructor function or nil expected"); - luaL_argcheck(L, lua_isnoneornil(L, 4) || lua_isfunction(L, 4), 4, "destructor function or nil expected"); - luaL_argcheck(L, lua_isnoneornil(L, 5) || lua_isfunction(L, 5), 5, "factory function or nil expected"); - luaL_argcheck(L, lua_isnoneornil(L, 6) || lua_istable(L, 6), 6, "module table or nil expected"); - - /* Push immediate parent module to stack */ - if(lua_isnoneornil(L, 6)) { - lua_pop(L, 1); /* remove the nil */ - is_in_module = luaT_fullparentname(tname, parent_name); - if (is_in_module) - luaT_getinnerparent(L, tname); - else - lua_pushglobaltable(L); - } - - if(!lua_istable(L, -1)) - luaL_error(L, "while creating metatable %s: bad argument #1 (%s is an invalid module name)", tname, parent_name); - - /* we first create the new metaclass if we have to */ - if(!luaT_pushmetatable(L, tname)) - { - /* create the metatable */ - 
lua_newtable(L); - - /* registry[name] = metatable */ - lua_pushvalue(L, -1); - lua_setfield(L, LUA_REGISTRYINDEX, tname); - - /* registry[metatable] = tname */ - lua_pushvalue(L, -1); - lua_pushstring(L, tname); - lua_rawset(L, LUA_REGISTRYINDEX); - - /* __index handling */ - lua_pushcfunction(L, luaT_mt__index); - lua_setfield(L, -2, "__index"); - - /* __newindex handling */ - lua_pushcfunction(L, luaT_mt__newindex); - lua_setfield(L, -2, "__newindex"); - - /* __typename contains the typename */ - lua_pushstring(L, tname); - lua_setfield(L, -2, "__typename"); - - /* __metatable is self */ - lua_pushvalue(L, -1); - lua_setfield(L, -2, "__metatable"); - - /* by default, __version equals 1 */ - lua_pushnumber(L, 1); - lua_setfield(L, -2, "__version"); - - /* assign default operator functions */ - lua_pushcfunction(L, luaT_mt__tostring); - lua_setfield(L, -2, "__tostring"); - - lua_pushcfunction(L, luaT_mt__add); - lua_setfield(L, -2, "__add"); - - lua_pushcfunction(L, luaT_mt__sub); - lua_setfield(L, -2, "__sub"); - - lua_pushcfunction(L, luaT_mt__mul); - lua_setfield(L, -2, "__mul"); - - lua_pushcfunction(L, luaT_mt__div); - lua_setfield(L, -2, "__div"); - - lua_pushcfunction(L, luaT_mt__mod); - lua_setfield(L, -2, "__mod"); - - lua_pushcfunction(L, luaT_mt__pow); - lua_setfield(L, -2, "__pow"); - - lua_pushcfunction(L, luaT_mt__unm); - lua_setfield(L, -2, "__unm"); - - lua_pushcfunction(L, luaT_mt__concat); - lua_setfield(L, -2, "__concat"); - - lua_pushcfunction(L, luaT_mt__len); - lua_setfield(L, -2, "__len"); - - lua_pushcfunction(L, luaT_mt__eq); - lua_setfield(L, -2, "__eq"); - - lua_pushcfunction(L, luaT_mt__lt); - lua_setfield(L, -2, "__lt"); - - lua_pushcfunction(L, luaT_mt__le); - lua_setfield(L, -2, "__le"); - - lua_pushcfunction(L, luaT_mt__call); - lua_setfield(L, -2, "__call"); - } - - /* we assign the parent class if necessary */ - if(!lua_isnoneornil(L, 2)) - { - if(lua_getmetatable(L, -1)) - luaL_error(L, "class %s has been already assigned a 
parent class\n", tname); - else - { - const char* parent_tname = luaL_checkstring(L, 2); - if(!luaT_pushmetatable(L, parent_tname)) - luaL_error(L, "bad argument #2 (invalid parent class name %s)", parent_tname); - lua_setmetatable(L, -2); - } - } - - /* register the destructor function */ - if(!lua_isnoneornil(L, 4)) - { - /* does it exists already? */ - lua_pushstring(L, "__gc"); - lua_rawget(L, -2); - - if(lua_isnil(L, -1)) - { - lua_pop(L, 1); /* pop nil */ - lua_pushstring(L, "__gc"); - lua_pushvalue(L, 4); - lua_rawset(L, -3); - } - else - luaL_error(L, "%s has been already assigned a destructor", tname); - } - - /* register the factory function */ - if(!lua_isnoneornil(L, 5)) - { - /* does it exists already? */ - lua_pushstring(L, "__factory"); - lua_rawget(L, -2); - - if(lua_isnil(L, -1)) - { - lua_pop(L, 1); /* pop nil */ - lua_pushstring(L, "__factory"); - lua_pushvalue(L, 5); - lua_rawset(L, -3); - } - else - luaL_error(L, "%s has been already assigned a factory", tname); - } - - /******** Constructor table and metatable ********/ - lua_pushstring(L, "__constructor"); - lua_rawget(L, -2); - if(lua_isnil(L, -1)) - { - lua_pop(L, 1); /* pop nil */ - lua_newtable(L); /* fancy table */ - lua_newtable(L); /* fancy metatable */ - - lua_pushvalue(L, -3); /* metatable */ - lua_setfield(L, -2, "__index"); /* so we can get the methods */ - - lua_pushcfunction(L, luaT_cmt__newindex); - lua_setfield(L, -2, "__newindex"); /* so we add new methods */ - - lua_pushcfunction(L, luaT_cmt__call); - lua_setfield(L, -2, "__call"); /* so we can create, we are here for only that */ - - lua_pushvalue(L, -3); - lua_setfield(L, -2, "__metatable"); /* redirect to metatable with methods */ - - lua_setmetatable(L, -2); /* constructor metatable is ... 
this fancy metatable */ - - /* set metatable[__constructor] = constructor-metatable */ - lua_pushstring(L, "__constructor"); - lua_pushvalue(L, -2); - lua_rawset(L, -4); - } - - /* register the constructor function */ - if(!lua_isnoneornil(L, 3)) - { - /* get constructor metatable */ - lua_getmetatable(L, -1); - - /* does it exists already? */ - lua_pushstring(L, "__new"); - lua_rawget(L, -2); - - if(lua_isnil(L, -1)) - { - lua_pop(L, 1); /* pop nil */ - lua_pushstring(L, "__new"); - lua_pushvalue(L, 3); - lua_rawset(L, -3); - - /* set "new" in the metatable too */ - lua_pushstring(L, "new"); - lua_pushvalue(L, 3); - lua_rawset(L, -5); - } - else - luaL_error(L, "%s has been already assigned a constructor", tname); - - /* pop constructor metatable */ - lua_pop(L, 1); - } - - /* module.name = constructor metatable */ - lua_setfield(L, 6, luaT_classrootname(tname)); - - return 1; /* returns the metatable */ -} - -/* Lua only utility functions */ - -/* add any custom type, provided the object has a metatable */ -int luaT_lua_metatype(lua_State *L) -{ - if( (lua_gettop(L) != 2) && (lua_gettop(L) != 3) ) - luaL_error(L, "expecting: string table [ctype]"); - - luaL_checkstring(L, 1); - luaL_checktype(L, 2, LUA_TTABLE); - - if(lua_gettop(L) == 3) - { - if(!luaT_cdataname(L, 3, lua_tostring(L, 1))) - luaL_error(L, "could not register cdata type -- missing ffi library?"); - } - - /* registry[name] = metatable */ - lua_pushvalue(L, 1); - lua_pushvalue(L, 2); - lua_rawset(L, LUA_REGISTRYINDEX); - - /* registry[metatable] = tname */ - lua_pushvalue(L, 2); - lua_pushvalue(L, 1); - lua_rawset(L, LUA_REGISTRYINDEX); - - return 0; -} - -/* return a userdata from a C pointer */ -/* you are better to know what you are doing */ -int luaT_lua_pushudata(lua_State *L) -{ - void *udata = NULL; - const char *tname = luaL_checkstring(L, 2); - - if(lua_type(L, 1) == 10) - udata = *((void**)lua_topointer(L, 1)); - else if(luaT_iscdata(L, 1)) - udata = ((void**)lua_topointer(L, 1))[4]; - else 
if(lua_isnumber(L, 1)) - udata = (void*)(uintptr_t)lua_tonumber(L, 1); - else - luaL_argerror(L, 1, "expecting number or cdata"); - - luaT_pushudata(L, udata, tname); - - return 1; -} - -int luaT_lua_factory(lua_State *L) -{ - const char* tname = luaL_checkstring(L, 1); - if(luaT_pushmetatable(L, tname) && !lua_isnil(L, -1)) - { - lua_pushstring(L, "__factory"); - lua_rawget(L, -2); - } - else - { - lua_pushnil(L); - } - return 1; -} - -int luaT_lua_getconstructortable(lua_State *L) -{ - const char* tname = luaL_checkstring(L, 1); - if(luaT_pushmetatable(L, tname)) - { - lua_pushstring(L, "__constructor"); - lua_rawget(L, -2); - return 1; - } - return 0; -} - - -int luaT_lua_typename(lua_State *L) -{ - const char* tname = NULL; - luaL_checkany(L, 1); - if((tname = luaT_typename(L, 1))) - { - lua_pushstring(L, tname); - return 1; - } - return 0; -} - -int luaT_lua_isequal(lua_State *L) -{ - if(lua_isuserdata(L, 1) && lua_isuserdata(L, 2)) - { - void **u1, **u2; - luaL_argcheck(L, luaT_typename(L, 1), 1, "Torch object expected"); - luaL_argcheck(L, luaT_typename(L, 2), 2, "Torch object expected"); - - u1 = lua_touserdata(L, 1); - u2 = lua_touserdata(L, 2); - if(*u1 == *u2) - lua_pushboolean(L, 1); - else - lua_pushboolean(L, 0); - } - else if(lua_istable(L, 1) && lua_istable(L, 2)) - lua_pushboolean(L, lua_rawequal(L, 1, 2)); - else - lua_pushboolean(L, 0); - return 1; -} - -static void luaT_pushpointer(lua_State *L, const void *ptr) -{ -#if LUA_VERSION_NUM >= 503 - // this assumes that lua_Integer is a ptrdiff_t - if (sizeof(void *) > sizeof(lua_Integer)) - luaL_error(L, "Pointer value can't be represented as a Lua integer (an overflow would occur)"); - lua_pushinteger(L, (uintptr_t)(ptr)); -#else - // 2^53 - this assumes that lua_Number is a double - if ((uintptr_t)ptr > 9007199254740992LLU) - luaL_error(L, "Pointer value can't be represented as a Lua number (an overflow would occur)"); - lua_pushnumber(L, (uintptr_t)(ptr)); -#endif -} - -int 
luaT_lua_pointer(lua_State *L) -{ - if(lua_type(L, 1) == 10) /* luajit cdata */ - { - /* we want the pointer holded by cdata */ - /* not the pointer on the cdata object */ - const void* ptr = *((void**)lua_topointer(L, 1)); - luaT_pushpointer(L, ptr); - return 1; - } - else if (luaT_iscdata(L, 1)) /* luaffi cdata */ - { - void** ptr = (void**)lua_touserdata(L, 1); - luaT_pushpointer(L, ptr[4]); - return 1; - } - else if(lua_isuserdata(L, 1)) - { - void **ptr; - luaL_argcheck(L, luaT_typename(L, 1), 1, "Torch object expected"); - ptr = lua_touserdata(L, 1); - luaT_pushpointer(L, *ptr); - return 1; - } - else if(lua_istable(L, 1) || lua_isthread(L, 1) || lua_isfunction(L, 1)) - { - const void* ptr = lua_topointer(L, 1); - luaT_pushpointer(L, ptr); - return 1; - } - else if(lua_isstring(L, 1)) - { - const char* ptr = lua_tostring(L, 1); - luaT_pushpointer(L, ptr); - return 1; - } - else - luaL_error(L, "Torch object, table, thread, cdata or function expected"); - - return 0; -} - -int luaT_lua_setenv(lua_State *L) -{ - if(!lua_isfunction(L, 1) && !lua_isuserdata(L, 1)) - luaL_typerror(L, 1, "function or userdata"); - luaL_checktype(L, 2, LUA_TTABLE); - lua_setuservalue(L, 1); - return 0; -} - -int luaT_lua_getenv(lua_State *L) -{ - if(!lua_isfunction(L, 1) && !lua_isuserdata(L, 1)) - luaL_typerror(L, 1, "function or userdata"); - lua_getuservalue(L, 1); - if (lua_isnil(L, -1)) - lua_newtable(L); - return 1; -} - -int luaT_lua_getmetatable(lua_State *L) -{ - const char *tname = luaL_checkstring(L, 1); - if(luaT_pushmetatable(L, tname)) - return 1; - return 0; -} - -int luaT_lua_version(lua_State *L) -{ - luaL_checkany(L, 1); - - if(luaT_iscdata(L, 1)) - { - const char *tname = luaT_cdataname(L, 1, NULL); - if(tname) - { - luaT_pushmetatable(L, tname); - lua_pushstring(L, "__version"); - lua_rawget(L, -2); - return 1; - } - return 0; - } - else if(lua_getmetatable(L, 1)) - { - lua_pushstring(L, "__version"); - lua_rawget(L, -2); - return 1; - } - return 0; -} - -int 
luaT_lua_setmetatable(lua_State *L) -{ - const char *tname = luaL_checkstring(L, 2); - luaL_checktype(L, 1, LUA_TTABLE); - - if(!luaT_pushmetatable(L, tname)) - luaL_error(L, "unknown typename %s\n", tname); - lua_setmetatable(L, 1); - - return 1; -} - -/* metatable operator methods */ -static int luaT_mt__index(lua_State *L) -{ - if(!lua_getmetatable(L, 1)) - luaL_error(L, "critical internal indexing error: no metatable found"); - - if(!lua_istable(L, -1)) - luaL_error(L, "critical internal indexing error: not a metatable"); - - /* test for __index__ method first */ - lua_getfield(L, -1, "__index__"); - if(!lua_isnil(L, -1)) - { - int result; - - if(!lua_isfunction(L, -1)) - luaL_error(L, "critical internal indexing error: __index__ is not a function"); - - lua_pushvalue(L, 1); - lua_pushvalue(L, 2); - - lua_call(L, 2, LUA_MULTRET); /* DEBUG: risque: faut vraiment retourner 1 ou 2 valeurs... */ - - result = lua_toboolean(L, -1); - lua_pop(L, 1); - - if(result) - return 1; - - /* on the stack: 1. the object 2. the value 3. 
the metatable */ - /* apparently, __index wants only one element returned */ - /* return lua_gettop(L)-3; */ - - } - else - lua_pop(L, 1); /* remove nil __index__ on the stack */ - - lua_pushvalue(L, 2); - lua_gettable(L, -2); - - return 1; -} - -static int luaT_mt__newindex(lua_State *L) -{ - if(!lua_getmetatable(L, 1)) - luaL_error(L, "critical internal indexing error: no metatable found"); - - if(!lua_istable(L, -1)) - luaL_error(L, "critical internal indexing error: not a metatable"); - - /* test for __newindex__ method first */ - lua_getfield(L, -1, "__newindex__"); - if(!lua_isnil(L, -1)) - { - int result; - - if(!lua_isfunction(L, -1)) - luaL_error(L, "critical internal indexing error: __newindex__ is not a function"); - - lua_pushvalue(L, 1); - lua_pushvalue(L, 2); - lua_pushvalue(L, 3); - - lua_call(L, 3, 1); /* DEBUG: risque: faut vraiment retourner qqch */ - - result = lua_toboolean(L, -1); - lua_pop(L, 1); - - if(result) - return 0; - } - else - lua_pop(L, 1); /* remove nil __newindex__ on the stack */ - - lua_pop(L, 1); /* pop the metatable */ - if(lua_istable(L, 1)) - lua_rawset(L, 1); - else - luaL_error(L, "the class %s cannot be indexed", luaT_typename(L, 1)); - - return 0; -} - - -#define MT_UNI_OPERATOR_GET_HANDLER(NAME) \ - if(!lua_getmetatable(L, 1)) \ - luaL_error(L, "internal error in __" #NAME ": no metatable"); - -#define MT_BIN_OPERATOR_GET_HANDLER(NAME) \ - if(!lua_getmetatable(L, 1) && !lua_getmetatable(L,2) ) \ - luaL_error(L, "internal error in __" #NAME \ - ": no metatable in both operands"); - -#define MT_DECLARE_OPERATOR_BODY(NAME, NIL_BEHAVIOR) \ - \ - lua_getfield(L, -1, "__" #NAME "__"); \ - if(lua_isnil(L, -1)) \ - { \ - NIL_BEHAVIOR; \ - } \ - else \ - { \ - if(lua_isfunction(L, -1)) \ - { \ - lua_insert(L, 1); /* insert function */ \ - lua_pop(L, 1); /* remove metatable */ \ - lua_call(L, lua_gettop(L)-1, LUA_MULTRET); \ - /* we return the result of the call */ \ - return lua_gettop(L); \ - } \ - /* we return the thing the 
user left in __tostring__ */ \ - } \ - return 0; \ - -/* note: check dans metatable pour ca, donc necessaire */ -#define MT_DECLARE_OPERATOR(NAME, NIL_BEHAVIOR) \ - int luaT_mt__##NAME(lua_State *L) \ - { \ - MT_UNI_OPERATOR_GET_HANDLER(NAME) \ - MT_DECLARE_OPERATOR_BODY(NAME,NIL_BEHAVIOR) \ - } - -#define MT_DECLARE_BIN_OPERATOR(NAME, NIL_BEHAVIOR) \ - int luaT_mt__##NAME(lua_State *L) \ - { \ - MT_BIN_OPERATOR_GET_HANDLER(NAME) \ - MT_DECLARE_OPERATOR_BODY(NAME,NIL_BEHAVIOR) \ - } - - -#define BIN_OPERATOR_ERROR(NAME) \ - luaL_error(L, "both %s and %s have no " #NAME " operator", \ - luaT_typename(L, 1), luaT_typename(L,2)) - -MT_DECLARE_BIN_OPERATOR(add, BIN_OPERATOR_ERROR(addition) ) -MT_DECLARE_BIN_OPERATOR(sub, BIN_OPERATOR_ERROR(substraction) ) -MT_DECLARE_BIN_OPERATOR(mul, BIN_OPERATOR_ERROR(multiplication) ) -MT_DECLARE_BIN_OPERATOR(div, BIN_OPERATOR_ERROR(division) ) -MT_DECLARE_BIN_OPERATOR(mod, BIN_OPERATOR_ERROR(modulo) ) -MT_DECLARE_BIN_OPERATOR(pow, BIN_OPERATOR_ERROR(power) ) -MT_DECLARE_BIN_OPERATOR(concat, BIN_OPERATOR_ERROR(concat) ) -MT_DECLARE_BIN_OPERATOR(eq, - lua_settop(L, 2); - lua_pushcfunction(L, luaT_lua_isequal); - lua_insert(L, 1); - lua_call(L, 2, 1); - return 1;) -MT_DECLARE_BIN_OPERATOR(lt, BIN_OPERATOR_ERROR(less-than) ) -MT_DECLARE_BIN_OPERATOR(le, BIN_OPERATOR_ERROR(less-equal) ) - -MT_DECLARE_OPERATOR(tostring, - lua_pushstring(L, luaT_typename(L, 1)); - return 1;) -MT_DECLARE_OPERATOR(call, luaL_error(L, "%s has no call operator", luaT_typename(L, 1))) -MT_DECLARE_OPERATOR(unm, luaL_error(L, "%s has no negation operator", luaT_typename(L, 1))) -MT_DECLARE_OPERATOR(len, luaL_error(L, "%s has no length operator", luaT_typename(L, 1))) - - -/* constructor metatable methods */ -int luaT_cmt__call(lua_State *L) -{ - if(!lua_istable(L, 1)) - luaL_error(L, "internal error in __call: not a constructor table"); - - if(!lua_getmetatable(L, 1)) - luaL_error(L, "internal error in __call: no metatable available"); - - lua_pushstring(L, 
"__new"); - lua_rawget(L, -2); - - if(lua_isnil(L, -1)) - luaL_error(L, "no constructor available"); - - lua_remove(L, 1); /* remove constructor atable */ - lua_insert(L, 1); /* insert constructor */ - lua_pop(L, 1); /* remove fancy metatable */ - - lua_call(L, lua_gettop(L)-1, LUA_MULTRET); - return lua_gettop(L); -} - -int luaT_cmt__newindex(lua_State *L) -{ - if(!lua_istable(L, 1)) - luaL_error(L, "internal error in __newindex: not a constructor table"); - - if(!lua_getmetatable(L, 1)) - luaL_error(L, "internal error in __newindex: no metatable available"); - - lua_pushstring(L, "__metatable"); - lua_rawget(L, -2); - - if(!lua_istable(L, -1)) - luaL_error(L, "internal error in __newindex: no metaclass available"); - - lua_insert(L, 2); - lua_pop(L, 1); /* remove the metatable over the constructor table */ - - lua_rawset(L, -3); - - return 0; -} - -/******************** deprecated functions ********************/ -int luaT_pushmetaclass(lua_State *L, const char *tname) -{ - return luaT_pushmetatable(L, tname); -} - -const char* luaT_id(lua_State *L, int ud) -{ - return luaT_typename(L, ud); -} - -const char* luaT_id2typename(lua_State *L, const char *id) -{ - return id; -} - -const char* luaT_typename2id(lua_State *L, const char *tname) -{ - return luaT_typenameid(L, tname); -} - -int luaT_getmetaclass(lua_State *L, int index) -{ - return lua_getmetatable(L, index); -} - -const char* luaT_checktypename2id(lua_State *L, const char *tname) -{ - const char* id = luaT_typenameid(L, tname); - if(!id) - luaL_error(L, "unknown class <%s>", tname); - return id; -} - -void luaT_registeratid(lua_State *L, const struct luaL_Reg *methods, const char *id) -{ - luaT_registeratname(L, methods, id); -} - -/**************************************************************/ diff --git a/contrib/lua-torch/torch7/lib/luaT/luaT.h b/contrib/lua-torch/torch7/lib/luaT/luaT.h deleted file mode 100644 index 2479a1dc1..000000000 --- a/contrib/lua-torch/torch7/lib/luaT/luaT.h +++ /dev/null @@ 
-1,135 +0,0 @@ -#ifndef LUAT_UTILS_INC -#define LUAT_UTILS_INC - -#ifdef __cplusplus -extern "C" { -#endif -#include <lua.h> -#include <lauxlib.h> -#ifdef __cplusplus -} -#endif - -#ifndef LUA_EXTERNC -# ifdef __cplusplus -# define LUA_EXTERNC extern "C" -# else -# define LUA_EXTERNC extern -# endif -#endif - -#if (defined(_MSC_VER) || defined(__MINGW32__)) -# define DLL_EXPORT __declspec(dllexport) -# define DLL_IMPORT __declspec(dllimport) -# ifdef luaT_EXPORTS -# define LUAT_API LUA_EXTERNC DLL_EXPORT -# else -# define LUAT_API LUA_EXTERNC DLL_IMPORT -# endif -#else -# define DLL_EXPORT -# define DLL_IMPORT -# define LUAT_API LUA_EXTERNC -#endif - -#if LUA_VERSION_NUM == 501 -# define lua_pushglobaltable(L) lua_pushvalue(L, LUA_GLOBALSINDEX) -# define lua_setuservalue lua_setfenv -# define lua_getuservalue lua_getfenv -#else -# define lua_objlen lua_rawlen -static int luaL_typerror(lua_State *L, int narg, const char *tname) -{ - return luaL_error(L, "%s expected, got %s", tname, luaL_typename(L, narg)); -} -#endif - - -/* C functions */ - -LUAT_API void* luaT_alloc(lua_State *L, ptrdiff_t size); -LUAT_API void* luaT_realloc(lua_State *L, void *ptr, ptrdiff_t size); -LUAT_API void luaT_free(lua_State *L, void *ptr); - -LUAT_API void luaT_setfuncs(lua_State *L, const luaL_Reg *l, int nup); - -LUAT_API const char* luaT_newlocalmetatable(lua_State *L, const char *tname, const char *parent_tname, - lua_CFunction constructor, lua_CFunction destructor, lua_CFunction factory, int moduleidx); - -LUAT_API const char* luaT_newmetatable(lua_State *L, const char *tname, const char *parenttname, - lua_CFunction constructor, lua_CFunction destructor, lua_CFunction factory); - -LUAT_API int luaT_pushmetatable(lua_State *L, const char *tname); - -LUAT_API const char* luaT_typenameid(lua_State *L, const char *tname); -LUAT_API const char* luaT_typename(lua_State *L, int ud); - -LUAT_API void luaT_pushudata(lua_State *L, void *udata, const char *tname); -LUAT_API void 
*luaT_toudata(lua_State *L, int ud, const char *tname); -LUAT_API int luaT_isudata(lua_State *L, int ud, const char *tname); -LUAT_API void *luaT_checkudata(lua_State *L, int ud, const char *tname); - -LUAT_API void luaT_pushlong(lua_State *L, long n); -LUAT_API long luaT_checklong(lua_State *L, int idx); -LUAT_API long luaT_tolong(lua_State *L, int idx); - -LUAT_API void luaT_pushinteger(lua_State *L, ptrdiff_t n); -LUAT_API ptrdiff_t luaT_checkinteger(lua_State *L, int idx); - -LUAT_API void *luaT_getfieldcheckudata(lua_State *L, int ud, const char *field, const char *tname); -LUAT_API void *luaT_getfieldchecklightudata(lua_State *L, int ud, const char *field); -LUAT_API double luaT_getfieldchecknumber(lua_State *L, int ud, const char *field); -LUAT_API int luaT_getfieldcheckint(lua_State *L, int ud, const char *field); -LUAT_API const char* luaT_getfieldcheckstring(lua_State *L, int ud, const char *field); -LUAT_API int luaT_getfieldcheckboolean(lua_State *L, int ud, const char *field); -LUAT_API void luaT_getfieldchecktable(lua_State *L, int ud, const char *field); - -LUAT_API int luaT_typerror(lua_State *L, int ud, const char *tname); - -LUAT_API int luaT_checkboolean(lua_State *L, int ud); -LUAT_API int luaT_optboolean(lua_State *L, int ud, int def); - -LUAT_API void luaT_registeratname(lua_State *L, const struct luaL_Reg *methods, const char *name); - -/* utility functions */ -LUAT_API const char *luaT_classrootname(const char *tname); -LUAT_API int luaT_classmodulename(const char *tname, char *module_name); - -/* debug */ -LUAT_API void luaT_stackdump(lua_State *L); - -/* Lua functions */ -LUAT_API int luaT_lua_newmetatable(lua_State *L); -LUAT_API int luaT_lua_factory(lua_State *L); -LUAT_API int luaT_lua_getconstructortable(lua_State *L); -LUAT_API int luaT_lua_typename(lua_State *L); -LUAT_API int luaT_lua_isequal(lua_State *L); -LUAT_API int luaT_lua_pointer(lua_State *L); -LUAT_API int luaT_lua_setenv(lua_State *L); -LUAT_API int 
luaT_lua_getenv(lua_State *L); -LUAT_API int luaT_lua_getmetatable(lua_State *L); -LUAT_API int luaT_lua_version(lua_State *L); -LUAT_API int luaT_lua_setmetatable(lua_State *L); -LUAT_API int luaT_lua_metatype(lua_State *L); -LUAT_API int luaT_lua_pushudata(lua_State *L); - -/* deprecated functions */ -/* ids have been replaced by string names to identify classes */ -/* comments show what function (that you should use) they call now */ -#if (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) -#define LUAT_DEPRECATED __attribute__((__deprecated__)) -#elif (defined(_MSC_VER) || defined(__MINGW32__)) -#define LUAT_DEPRECATED __declspec(deprecated) -#else -#define LUAT_DEPRECATED -#endif - -LUAT_API LUAT_DEPRECATED int luaT_pushmetaclass(lua_State *L, const char *tname); /* same as luaT_pushmetatable */ -LUAT_API LUAT_DEPRECATED const char* luaT_id(lua_State *L, int ud); /* same as luaT_typename */ -LUAT_API LUAT_DEPRECATED const char* luaT_id2typename(lua_State *L, const char *id); /* same as luaT_typenameid */ -LUAT_API LUAT_DEPRECATED const char* luaT_typename2id(lua_State *L, const char*); /* same as luaT_typenameid */ -LUAT_API LUAT_DEPRECATED int luaT_getmetaclass(lua_State *L, int index); /* same as luaT_getmetatable */ -LUAT_API LUAT_DEPRECATED const char* luaT_checktypename2id(lua_State *L, const char *tname); /* same as luaT_typenameid */ -LUAT_API LUAT_DEPRECATED void luaT_registeratid(lua_State *L, const struct luaL_Reg *methods, const char *id); /* same as luaT_registeratname */ - -#endif diff --git a/contrib/lua-torch/torch7/lib/luaT/luaTConfig.cmake.in b/contrib/lua-torch/torch7/lib/luaT/luaTConfig.cmake.in deleted file mode 100644 index bfb20b87a..000000000 --- a/contrib/lua-torch/torch7/lib/luaT/luaTConfig.cmake.in +++ /dev/null @@ -1,9 +0,0 @@ -# Find the luaT includes and library -# -# LUAT_INCLUDE_DIR -- where to find the includes -# LUAT_LIBRARIES -- list of libraries to link against -# LUAT_FOUND -- set to 1 if found - -SET(LUAT_FOUND 1) 
-SET(LUAT_INCLUDE_DIR "@LUAT_INCLUDE_DIR@") -SET(LUAT_LIBRARIES "@LUAT_LIBRARIES@") |