#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/THTensorConv.c"
#else

/*
  2D Input, 2D kernel  : convolve given image with the given kernel.
*/
void THTensor_(validXCorr2Dptr)(real *r_,
                                       real alpha,
                                       real *t_, long ir, long ic,
                                       real *k_, long kr, long kc,
                                       long sr, long sc)
{
  long or = (ir - kr) / sr + 1;
  long oc = (ic - kc) / sc + 1;

  long xx, yy, kx, ky;

  if ((sc != 1) || (oc < 4))  {
    /* regular convolution */
    for(yy = 0; yy < or; yy++) {
      for(xx = 0; xx < oc; xx++) {
        /* Dot product in two dimensions... (between input image and the mask) */
        real *pi_ = t_ + yy*sr*ic + xx*sc;
        real *pw_ = k_;
        real sum = 0;
        for(ky = 0; ky < kr; ky++) {
          for(kx = 0; kx < kc; kx++) {
            sum += pi_[kx]*pw_[kx];
          }
          pi_ += ic; /* next input line */
          pw_ += kc; /* next mask line */
        }
        /* Update output */
        *r_++ += alpha*sum;
      }
    }

  } else {
    /* SSE-based convolution */
    for(yy = 0; yy < or; yy++) {
      real *pi_ = t_ + yy*sr*ic;
      real *pw_ = k_;
      for (ky = 0; ky < kr; ky++) {
        real *pis_ = pi_;
        for (kx = 0; kx < kc; kx++) {
          THVector_(cadd)(r_, r_, pis_, alpha*pw_[kx], oc);
          pis_++;
        }
        pi_ += ic; /* next input line */
        pw_ += kc; /* next mask line */
      }
      r_ += oc;
    }
  }
}

/*
  2D Input, 2D kernel  : convolve given image with the given kernel.
*/
void THTensor_(validConv2Dptr)(real *r_,
                                      real alpha,
                                      real *t_, long ir, long ic,
                                      real *k_, long kr, long kc,
                                      long sr, long sc)
{
  long or = (ir - kr) / sr + 1;
  long oc = (ic - kc) / sc + 1;

  long xx, yy, kx, ky;

  if ((sc != 1) || (oc < 4))  {
    /* regular convolution */
    for(yy = 0; yy < or; yy++) {
      for(xx = 0; xx < oc; xx++) {
        /* Dot product in two dimensions... (between input image and the mask) */
        real *pi_ = t_ + yy*sr*ic + xx*sc;
        real *pw_ = k_ + kr*kc - 1;
        real sum = 0;
        for(ky = 0; ky < kr; ky++) {
          for(kx = 0; kx < kc; kx++) {
            sum += pi_[kx]*pw_[-kx];
          }
          pi_ += ic; /* next input line */
          pw_ -= kc; /* next mask line */
        }
        /* Update output */
        *r_++ += alpha*sum;
      }
    }

  } else {
    /* SSE-based convolution */
    for(yy = 0; yy < or; yy++) {
      real *pw_ = k_ + kr*kc - 1;
      real *pi_ = t_ + yy*sr*ic;
      for (ky = 0; ky < kr; ky++) {
        real *pis_ = pi_;
        for (kx = 0; kx < kc; kx++) {
          THVector_(cadd)(r_, r_, pis_, alpha*pw_[-kx], oc);
          pis_++;
        }
        pi_ += ic; /* next input line */
        pw_ -= kc; /* next mask line */
      }
      r_ += oc;
    }
  }
}

/*
  2D Input, 2D kernel  : convolve given image with the given kernel, full convolution.
*/
void THTensor_(fullConv2Dptr)(real *r_,
                                     real alpha,
                                     real *t_, long ir, long ic,
                                     real *k_, long kr, long kc,
                                     long sr, long sc)
{
  long oc = (ic - 1) * sc + kc;

  long xx, yy, kx, ky;

  if ((sc != 1) || (ic < 4))  {
    /* regular convolution */
    for(yy = 0; yy < ir; yy++) {
      for(xx = 0; xx < ic; xx++) {
        /* Outer product in two dimensions... (between input image and the mask) */
        real *po_ = r_ + yy*sr*oc + xx*sc;
        real *pw_ = k_;
        for(ky = 0; ky < kr; ky++)
        {
          real z = *t_ * alpha;
          for(kx = 0; kx < kc; kx++) {
            po_[kx] += z * pw_[kx];
          }
          po_ += oc; /* next input line */
          pw_ += kc; /* next mask line */
        }
        t_++;
      }
    }

  } else {
    /* SSE-based convolution */
    for(yy = 0; yy < ir; yy++) {
      real *po_ = r_ + yy*sr*oc;
      real *pw_ = k_;
      for (ky = 0; ky < kr; ky++) {
        real *pos_ = po_;
        for (kx = 0; kx < kc; kx++) {
          THVector_(cadd)(pos_, pos_, t_, alpha*pw_[kx], ic);
          pos_++;
        }
        po_ += oc; /* next input line */
        pw_ += kc; /* next mask line */
      }
      t_ += ic;
    }
  }
}

/*
  2D Input, 2D kernel  : convolve given image with the given kernel, full convolution.
*/
void THTensor_(fullXCorr2Dptr)(real *r_,
                                      real alpha,
                                      real *t_, long ir, long ic,
                                      real *k_, long kr, long kc,
                                      long sr, long sc)
{
  long oc = (ic - 1) * sc + kc;

  long xx, yy, kx, ky;

  if ((sc != 1) || (ic < 4))  {
    /* regular convolution */
    for(yy = 0; yy < ir; yy++) {
      for(xx = 0; xx < ic; xx++) {
        /* Outer product in two dimensions... (between input image and the mask) */
        real *po_ = r_ + yy*sr*oc + xx*sc;
        real *pw_ = k_ + kr*kc -1;
        long kx, ky;
        for(ky = 0; ky < kr; ky++)
        {
          real z = *t_ * alpha;
          for(kx = 0; kx < kc; kx++) {
            po_[kx] += z * pw_[-kx];
          }
          po_ += oc; /* next input line */
          pw_ -= kc; /* next mask line */
        }
        t_++;
      }
    }

  } else {
    /* SSE-based convolution */
    for(yy = 0; yy < ir; yy++) {
      real *po_ = r_ + yy*sr*oc;
      real *pw_ = k_ + kr*kc -1;
      for (ky = 0; ky < kr; ky++) {
        real *pos_ = po_;
        for (kx = 0; kx < kc; kx++) {
          THVector_(cadd)(pos_, pos_, t_, pw_[-kx]*alpha, ic);
          pos_++;
        }
        po_ += oc; /* next input line */
        pw_ -= kc; /* next mask line */
      }
      t_ += ic;
    }
  }
}

/*
  2D Input, 2D kernel  : convolve given image with the given kernel, valid convolution.
  for sr,sc=1 this is equivalent to validXCorr2Dptr, but otherwise it is useful for
  calculating derivatives wrt a kernel that is applied with stride sr,sc != 1
*/
void THTensor_(validXCorr2DRevptr)(real *r_,
                                          real alpha,
                                          real *t_, long ir, long ic,
                                          real *k_, long kr, long kc,
                                          long sr, long sc)
{
  long or = ir - (kr - 1) * sr;
  long oc = ic - (kc - 1) * sc;

  long xx, yy, kx, ky;

  if ((sc != 1) || (kc < 4))  {
    /* regular convolution */
    for(yy = 0; yy < kr; yy++) {
      for(xx = 0; xx < kc; xx++) {
        real *po_ = r_;
        real *pi_ = t_ + yy*sr*ic + xx*sc;
        real z = *k_++ * alpha;

        for(ky = 0; ky < or; ky++) {
          for(kx = 0; kx < oc; kx++)
            po_[kx] += z * pi_[kx];
          pi_ += ic;
          po_ += oc;
        }
      }
    }

  } else {
    /* SSE-based convolution */
    for(yy = 0; yy < kr; yy++) {
      for(xx = 0; xx < kc; xx++) {
        real *po_ = r_;
        real *pi_ = t_ + yy*sr*ic + xx*sc;
        real z = *k_++ * alpha;

        for(ky = 0; ky < or; ky++) {
          THVector_(cadd)(po_, po_, pi_, z, oc);
          pi_ += ic;
          po_ += oc;
        }
      }
    }
  }
}
/*
  3D Input, 3D kernel  : convolve given volume with the given kernel.
*/
void THTensor_(validXCorr3Dptr)(real *r_,
                                       real alpha,
                                       real *t_, long it, long ir, long ic,
                                       real *k_, long kt, long kr, long kc,
                                       long st, long sr, long sc)
{
  long ot = (it - kt) / st + 1;
  long or = (ir - kr) / sr + 1;
  long oc = (ic - kc) / sc + 1;

  long zz, xx, yy;

  for (zz = 0; zz < ot; zz++)
  {
    for(yy = 0; yy < or; yy++)
    {
      for(xx = 0; xx < oc; xx++)
      {
        /* Dot product in two dimensions... (between input image and the mask) */
        real *pi_ = t_ + zz*st*ir*ic + yy*sr*ic + xx*sc;
        real *pw_ = k_;
        real sum = 0;
        long kz, kx, ky;
        for(kz = 0; kz < kt; kz++)
        {
          for(ky = 0; ky < kr; ky++)
          {
            for(kx = 0; kx < kc; kx++) {
              sum += pi_[kx]*pw_[kx];
            }
            pi_ += ic; /* next input line */
            pw_ += kc; /* next mask line */
          }
          pi_ += (ir-kr)*ic; /* next input slice */
        }
        /* Update output */
        *r_++ += sum*alpha;
      }
    }
  }
}

/*
  3D Input, 3D kernel  : convolve given volume with the given kernel.
*/
void THTensor_(validConv3Dptr)(real *r_,
                                      real alpha,
                                      real *t_, long it, long ir, long ic,
                                      real *k_, long kt, long kr, long kc,
                                      long st, long sr, long sc)
{
  long ot = (it - kt) / st + 1;
  long or = (ir - kr) / sr + 1;
  long oc = (ic - kc) / sc + 1;

  long zz, xx, yy;

  for(zz = 0; zz < ot; zz++)
  {
    for(yy = 0; yy < or; yy++)
    {
      for(xx = 0; xx < oc; xx++)
      {
        /* Dot product in two dimensions... (between input image and the mask) */
        real *pi_ = t_ + zz*st*ir*ic + yy*sr*ic + xx*sc;
        real *pw_ = k_ + kt*kr*kc - 1;
        real sum = 0;
        long kz, kx, ky;
        for(kz = 0; kz < kt; kz++)
        {
          for(ky = 0; ky < kr; ky++)
          {
            for(kx = 0; kx < kc; kx++) {
              sum += pi_[kx]*pw_[-kx];
            }
            pi_ += ic; /* next input line */
            pw_ -= kc; /* next mask line */
          }
          pi_ += (ir-kr)*ic; /* next input slice */
        }
        /* Update output */
        *r_++ += alpha*sum;
      }
    }
  }
}


/*
  3D Input, 3D kernel  : convolve given volume with the given kernel, full convolution.
*/
void THTensor_(fullConv3Dptr)(real *r_,
                                     real alpha,
                                     real *t_, long it, long ir, long ic,
                                     real *k_, long kt, long kr, long kc,
                                     long st, long sr, long sc)
{
  long or = (ir - 1) * sr + kr;
  long oc = (ic - 1) * sc + kc;

  long zz, xx, yy;

  for(zz = 0; zz < it; zz++)
  {
    for(yy = 0; yy < ir; yy++)
    {
      for(xx = 0; xx < ic; xx++)
      {
        /* Outer product in two dimensions... (between input image and the mask) */
        real *po_ = r_ + zz*st*or*oc + yy*sr*oc + xx*sc;
        real *pw_ = k_;
        long kz, kx, ky;
        /* printf("Output Plane : %ld,%ld,%ld, input val=%g\n",zz,yy,xx,*t_); */
        for(kz = 0; kz < kt; kz++)
        {
          for(ky = 0; ky < kr; ky++)
          {
            real z = *t_ * alpha;
            for(kx = 0; kx < kc; kx++) {
              /* printf("o=%g,k=%g," , po_[kx],pw_[kx]); */
              po_[kx] += z * pw_[kx];
              /* printf("o=%g " , po_[kx]); */
            }
            /* printf("\n"); */
            po_ += oc; /* next input line */
            pw_ += kc; /* next mask line */
          }
          po_ += (or-kr)*oc; /* next output slice */
          /* printf("\n"); */
        }
        t_++;
      }
    }
  }
}

/*
  3D Input, 3D kernel  : convolve given volume with the given kernel, full convolution.
*/
void THTensor_(fullXCorr3Dptr)(real *r_,
                                      real alpha,
                                      real *t_, long it, long ir, long ic,
                                      real *k_, long kt, long kr, long kc,
                                      long st, long sr, long sc)
{
  long or = (ir - 1) * sr + kr;
  long oc = (ic - 1) * sc + kc;

  long zz, xx, yy;

  for(zz = 0; zz < it; zz++)
  {
    for(yy = 0; yy < ir; yy++)
    {
      for(xx = 0; xx < ic; xx++)
      {
        /* Outer product in two dimensions... (between input image and the mask) */
        real *po_ = r_ + zz*st*or*oc + yy*sr*oc + xx*sc;
        real *pw_ = k_ + kt*kr*kc -1;
        long kz, kx, ky;
        for(kz = 0; kz < kt; kz++)
        {
          for(ky = 0; ky < kr; ky++)
          {
            real z = *t_ * alpha;
            for(kx = 0; kx < kc; kx++) {
              po_[kx] += z * pw_[-kx];
            }
            po_ += oc; /* next input line */
            pw_ -= kc; /* next mask line */
          }
          po_ += (or-kr)*oc; /* next output slice */
        }
        t_++;
      }
    }
  }
}

/*
  3D Input, 3D kernel  : convolve given image with the given kernel, valid convolution.
  for sr,sc=1 this is equivalent to validXCorr3Dptr, but otherwise it is useful for
  calculating derivatives wrt a kernel that is applied with stride sr,sc != 1
*/
void THTensor_(validXCorr3DRevptr)(real *r_,
                                          real alpha,
                                          real *t_, long it, long ir, long ic,
                                          real *k_, long kt, long kr, long kc,
                                          long st, long sr, long sc)
{
  long ot = it - (kt - 1) * st;
  long or = ir - (kr - 1) * sr;
  long oc = ic - (kc - 1) * sc;

  long zz, xx, yy;
  for(zz = 0; zz < kt; zz++)
  {
    for(yy = 0; yy < kr; yy++)
    {
      for(xx = 0; xx < kc; xx++)
      {
        real *po_ = r_;
        real *pi_ = t_ + zz*st*ir*ic + yy*sr*ic + xx*sc;
        real z = *k_++ * alpha;
        long kz, kx, ky;
        for(kz = 0; kz < ot; kz++)
        {
          for(ky = 0; ky < or; ky++)
          {
            for(kx = 0; kx < oc; kx++)
              po_[kx] += z * pi_[kx];
            pi_ += ic;
            po_ += oc;
          }
          pi_ += (ir-or)*ic; /* next input slice */
        }
      }
    }
  }
}

void THTensor_(conv2d)(real* output_data,
                       real alpha,
                       real* ptr_input, long nInputRows, long nInputCols,
                       real* ptr_weight, long nKernelRows, long nKernelCols,
                       long srow, long scol,
                       const char *vf, const char *xc)
{
  THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can be 'V' or 'F'");
  THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can be 'X' or 'C'");
  if (*vf == 'F')
    if (*xc == 'X')
      THTensor_(fullXCorr2Dptr)(output_data,
                                alpha,
                                ptr_input,  nInputRows,  nInputCols,
                                ptr_weight, nKernelRows, nKernelCols,
                                srow, scol);
    else
      THTensor_(fullConv2Dptr)(output_data,
                               alpha,
                               ptr_input,  nInputRows,  nInputCols,
                               ptr_weight, nKernelRows, nKernelCols,
                               srow, scol);
  else
    if (*xc == 'X')
      THTensor_(validXCorr2Dptr)(output_data,
                                 alpha,
                                 ptr_input,  nInputRows,  nInputCols,
                                 ptr_weight, nKernelRows, nKernelCols,
                                 srow, scol);
    else
      THTensor_(validConv2Dptr)(output_data,
                                alpha,
                                ptr_input,  nInputRows,  nInputCols,
                                ptr_weight, nKernelRows, nKernelCols,
                                srow, scol);
}

void THTensor_(conv3d)(real* output_data,
                       real alpha,
                       real* ptr_input, long nInputDepth, long nInputRows, long nInputCols,
                       real* ptr_weight, long nKernelDepth, long nKernelRows, long nKernelCols,
                       long sdepth, long srow, long scol,
                       const char *vf, const char *xc)
{
  THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can be 'V' or 'F'");
  THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can be 'X' or 'C'");
  if (*vf == 'F')
    if (*xc == 'X')
      THTensor_(fullXCorr3Dptr)(output_data,
                                alpha,
                                ptr_input, nInputDepth, nInputRows,  nInputCols,
                                ptr_weight, nKernelDepth, nKernelRows, nKernelCols,
                                sdepth, srow, scol);
    else
      THTensor_(fullConv3Dptr)(output_data,
                               alpha,
                               ptr_input, nInputDepth, nInputRows,  nInputCols,
                               ptr_weight, nKernelDepth, nKernelRows, nKernelCols,
                               sdepth, srow, scol);
  else
    if (*xc == 'X')
      THTensor_(validXCorr3Dptr)(output_data,
                                 alpha,
                                 ptr_input, nInputDepth, nInputRows,  nInputCols,
                                 ptr_weight, nKernelDepth, nKernelRows, nKernelCols,
                                 sdepth, srow, scol);
    else
      THTensor_(validConv3Dptr)(output_data,
                                alpha,
                                ptr_input, nInputDepth, nInputRows,  nInputCols,
                                ptr_weight, nKernelDepth, nKernelRows, nKernelCols,
                                sdepth, srow, scol);
}

long THTensor_(convsize)(long x, long k, long s, const char* vf)
{
  THArgCheck(*vf == 'V' || *vf == 'F', 1, "type of convolution can be 'V' or 'F'");
  if (*vf == 'V')
    return (x-k)/s + 1;
  else
    return (x-1)*s + k;
}


/*
  3D input, 3D kernel, 4D output
  like rank1 update
  A <- xx' + beta*A
  for sr,sc=1 this is equivalent to conv2Dger, but otherwise it is useful for
  calculating derivatives wrt a kernel that is applied with stride sr,sc != 1
*/
void THTensor_(conv2DRevger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol)
{
  long nInputPlane, nInputRows, nInputCols;
  long nKernelPlane, nKernelRows, nKernelCols;
  long nOutputPlane, nOutputRows, nOutputCols;
  long istride0, kstride0;
  THTensor *input;
  THTensor *kernel;
  real *input_data;
  real *weight_data;
  real *output_data;
  ptrdiff_t nelem;
  long k;

  THArgCheck(t_->nDimension == 3 , 3, "input: 3D Tensor expected");
  THArgCheck(k_->nDimension == 3 , 4, "kernel: 3D Tensor expected");
  THArgCheck(srow >= 1, 5, "Stride should be a positive integer");
  THArgCheck(scol >= 1, 6, "Stride should be a positive integer");

  input = THTensor_(newContiguous)(t_);
  kernel = THTensor_(newContiguous)(k_);

  nInputPlane = input->size[0];
  istride0    = input->stride[0];
  nInputRows  = input->size[1];
  nInputCols  = input->size[2];

  kstride0 = kernel->stride[0];
  nKernelPlane = kernel->size[0];
  nKernelRows = kernel->size[1];
  nKernelCols = kernel->size[2];
  nOutputPlane = nInputPlane * kernel->size[0];

  THArgCheck(nInputRows >= nKernelRows && nInputCols >= nKernelCols , 2, "covn2DRevger : Input image is smaller than kernel");

  nOutputRows = nInputRows - (nKernelRows - 1) * srow;
  nOutputCols = nInputCols - (nKernelCols - 1) * scol;

  nelem = THTensor_(nElement)(r_);
  THTensor_(resize4d)(r_,nKernelPlane, nInputPlane, nOutputRows, nOutputCols);

  input_data = THTensor_(data)(input);
  weight_data = THTensor_(data)(kernel);
  output_data = THTensor_(data)(r_);

  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
  {
    /*THTensor_(zero)(r_);*/

#pragma omp parallel for private(k)
    for (k = 0; k < r_->size[0]*r_->size[1]; k++)
    {
      real* ptr_output = output_data + k*nOutputCols*nOutputRows;
      long l;
      for (l = 0; l < nOutputRows*nOutputCols; l++)
        ptr_output[l] = 0.0;
    }
  }
  else if (beta != 1)
  {
    /*THTensor_(mul)(r_, beta);*/
#pragma omp parallel for private(k)
    for (k = 0; k < r_->size[0]*r_->size[1]; k++)
    {
      real* ptr_output = output_data + k*nOutputCols*nOutputRows;
      long l;
      for (l = 0; l < nOutputRows*nOutputCols; l++)
        ptr_output[l] *= beta;
    }
  }

#pragma omp parallel for private(k)
  for(k = 0; k < nKernelPlane; k++)
  {
    long i;
    /* get kernel */
    real *ptr_weight = weight_data+k*kstride0;

    for(i = 0; i < nInputPlane; i++)
    {
      /* get output */
      real *ptr_output = output_data + k*nInputPlane*nOutputCols*nOutputRows + i*nOutputCols*nOutputRows;
      /* get input */
      real *ptr_input = input_data+i*istride0;

      /* do image, kernel convolution */
      THTensor_(validXCorr2DRevptr)(ptr_output,
                                    alpha,
                                    ptr_input,  nInputRows,  nInputCols,
                                    ptr_weight, nKernelRows, nKernelCols,
                                    srow, scol);
      /* Next output plane */
      /* output_data += nOutputCols*nOutputRows; */
    }
  }
  THTensor_(free)(input);
  THTensor_(free)(kernel);
}


/*
  3D input, 3D kernel, 4D output
  like rank1 update
  A <- xx' + beta*A
  for sr,sc=1 this is equivalent to conv2Dger, but otherwise it is useful for
  calculating derivatives wrt a kernel that is applied with stride sr,sc != 1
*/
void THTensor_(conv2DRevgerm)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol)
{
  long nbatch, nInputPlane, nInputRows, nInputCols;
  long nKernelPlane, nKernelRows, nKernelCols;
  long nOutputRows, nOutputCols;
  long istride0, kstride0, istride1, kstride1;
  THTensor *input;
  THTensor *kernel;
  real *input_data;
  real *weight_data;
  real *output_data;
  ptrdiff_t nelem;
  long k;

  THArgCheck(t_->nDimension == 4 , 3, "input: 4D Tensor expected");
  THArgCheck(k_->nDimension == 4 , 4, "kernel: 4D Tensor expected");
  THArgCheck(srow >= 1, 5, "Stride should be a positive integer");
  THArgCheck(scol >= 1, 6, "Stride should be a positive integer");

  input = THTensor_(newContiguous)(t_);
  kernel = THTensor_(newContiguous)(k_);

  istride0    = input->stride[0];
  istride1    = input->stride[1];
  nbatch      = input->size[0];
  nInputPlane = input->size[1];
  nInputRows  = input->size[2];
  nInputCols  = input->size[3];

  kstride0 = kernel->stride[0];
  kstride1 = kernel->stride[1];
  nKernelPlane = kernel->size[1];
  nKernelRows = kernel->size[2];
  nKernelCols = kernel->size[3];

  THArgCheck(nInputRows >= nKernelRows && nInputCols >= nKernelCols , 2, "conv2DRevger : Input image is smaller than kernel");
  THArgCheck(kernel->size[0] == input->size[0] , 2, "conv2DRevger : Input batch and kernel batch is not same size");

  nOutputRows = nInputRows - (nKernelRows - 1) * srow;
  nOutputCols = nInputCols - (nKernelCols - 1) * scol;

  nelem = THTensor_(nElement)(r_);
  THTensor_(resize4d)(r_,nKernelPlane, nInputPlane, nOutputRows, nOutputCols);

  input_data = THTensor_(data)(input);
  weight_data = THTensor_(data)(kernel);
  output_data = THTensor_(data)(r_);

  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
  {
    /*THTensor_(zero)(r_);*/

#pragma omp parallel for private(k)
    for (k = 0; k < r_->size[0]*r_->size[1]; k++)
    {
      real* ptr_output = output_data + k*nOutputCols*nOutputRows;
      long l;
      for (l = 0; l < nOutputRows*nOutputCols; l++)
        ptr_output[l] = 0.0;
    }
  }
  else if (beta != 1)
  {
    /*THTensor_(mul)(r_, beta);*/
#pragma omp parallel for private(k)
    for (k = 0; k < r_->size[0]*r_->size[1]; k++)
    {
      real* ptr_output = output_data + k*nOutputCols*nOutputRows;
      long l;
      for (l = 0; l < nOutputRows*nOutputCols; l++)
        ptr_output[l] *= beta;
    }
  }

#pragma omp parallel for private(k)
  for(k = 0; k < nKernelPlane; k++)
  {
    long i;
    for(i = 0; i < nInputPlane; i++)
    {
      long p;
      for(p = 0; p < nbatch; p++)
      {
        /* get kernel */
        real *ptr_weight = weight_data + p*kstride0 + k*kstride1;
        /* get output */
        real *ptr_output = output_data + k*nInputPlane*nOutputCols*nOutputRows + i*nOutputCols*nOutputRows;
        /* get input */
        real *ptr_input = input_data + p*istride0 + i*istride1;

        /* do image, kernel convolution */
        THTensor_(validXCorr2DRevptr)(ptr_output,
                                      alpha,
                                      ptr_input,  nInputRows,  nInputCols,
                                      ptr_weight, nKernelRows, nKernelCols,
                                      srow, scol);
        /* Next output plane */
        /* output_data += nOutputCols*nOutputRows; */
      }
    }
  }
  THTensor_(free)(input);
  THTensor_(free)(kernel);
}


/*
  3D input, 3D kernel, 4D output
  like rank1 update
  A <- xx' + beta*A
*/
void THTensor_(conv2Dger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol, const char *vf, const char *xc)
{
  long nInputPlane, nInputRows, nInputCols;
  long nKernelPlane, nKernelRows, nKernelCols;
  long nOutputPlane, nOutputRows, nOutputCols;
  long istride0, kstride0;

  THTensor *input;
  THTensor *kernel;
  real *input_data;
  real *weight_data;
  real *output_data;
  ptrdiff_t nelem;
  long k;

  THArgCheck(t_->nDimension == 3 , 3, "input: 3D Tensor expected");
  THArgCheck(k_->nDimension == 3 , 4, "kernel: 3D Tensor expected");
  THArgCheck(srow >= 1, 5, "Stride should be a positive integer");
  THArgCheck(scol >= 1, 6, "Stride should be a positive integer");
  THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can 'V' or 'F'");
  THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can 'X' or 'C'");

  input = THTensor_(newContiguous)(t_);
  kernel = THTensor_(newContiguous)(k_);

  nInputPlane = input->size[0];
  istride0    = input->stride[0];
  nInputRows  = input->size[1];
  nInputCols  = input->size[2];

  kstride0 = kernel->stride[0];
  nKernelPlane = kernel->size[0];
  nKernelRows = kernel->size[1];
  nKernelCols = kernel->size[2];
  nOutputPlane = nInputPlane * kernel->size[0];

  THArgCheck((nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dger : Input image is smaller than kernel");

  if (*vf == 'F') {
    nOutputRows = (nInputRows - 1) * srow + nKernelRows;
    nOutputCols = (nInputCols - 1) * scol + nKernelCols;
  } else { /* valid */
    nOutputRows = (nInputRows - nKernelRows) / srow + 1;
    nOutputCols = (nInputCols - nKernelCols) / scol + 1;
  }

  nelem = THTensor_(nElement)(r_);
  THTensor_(resize4d)(r_, nKernelPlane, nInputPlane, nOutputRows, nOutputCols);

  input_data = THTensor_(data)(input);
  weight_data = THTensor_(data)(kernel);
  output_data = THTensor_(data)(r_);

  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
  {
    /*THTensor_(zero)(r_);*/
#pragma omp parallel for private(k)
    for (k = 0; k < r_->size[0]*r_->size[1]; k++)
    {
      real* ptr_output = output_data + k*nOutputCols*nOutputRows;
      long l;
      for (l = 0; l < nOutputRows*nOutputCols; l++)
        ptr_output[l] = 0.0;
    }
  }
  else if (beta != 1)
  {
    /*THTensor_(mul)(r_, beta);*/
#pragma omp parallel for private(k)
    for (k = 0; k < r_->size[0]*r_->size[1]; k++)
    {
      real* ptr_output = output_data + k*nOutputCols*nOutputRows;
      long l;
      for (l = 0; l < nOutputRows*nOutputCols; l++)
        ptr_output[l] *= beta;
    }
  }

#pragma omp parallel for private(k)
  for(k = 0; k < nKernelPlane; k++)
  {
    long i;
    /* get kernel */
    real *ptr_weight = weight_data+k*kstride0;

    for(i = 0; i < nInputPlane; i++)
    {
      /* get output */
      real *ptr_output = output_data + k*nInputPlane*nOutputCols*nOutputRows + i*nOutputCols*nOutputRows;
      /* get input */
      real *ptr_input = input_data+i*istride0;

      /* do image, kernel convolution */
      if (*vf == 'F')
        if (*xc == 'X')
          THTensor_(fullXCorr2Dptr)(ptr_output,
                                    alpha,
                                    ptr_input,  nInputRows,  nInputCols,
                                    ptr_weight, nKernelRows, nKernelCols,
                                    srow, scol);
        else
          THTensor_(fullConv2Dptr)(ptr_output,
                                   alpha,
                                   ptr_input,  nInputRows,  nInputCols,
                                   ptr_weight, nKernelRows, nKernelCols,
                                   srow, scol);
      else
        if (*xc == 'X')
          THTensor_(validXCorr2Dptr)(ptr_output,
                                     alpha,
                                     ptr_input,  nInputRows,  nInputCols,
                                     ptr_weight, nKernelRows, nKernelCols,
                                     srow, scol);
        else
          THTensor_(validConv2Dptr)(ptr_output,
                                    alpha,
                                    ptr_input,  nInputRows,  nInputCols,
                                    ptr_weight, nKernelRows, nKernelCols,
                                    srow, scol);
      /* Next output plane */
      /* output_data += nOutputCols*nOutputRows; */
    }
  }
  THTensor_(free)(input);
  THTensor_(free)(kernel);
}


/*
  3D input, 4D kernel, 3D output
  matrix vector product like
  y <- Ax + beta*y
*/
void THTensor_(conv2Dmv)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol, const char *vf, const char *xc)
{
  long nInputPlane, nInputRows, nInputCols;
  long nKernelRows, nKernelCols;
  long nOutputPlane, nOutputRows, nOutputCols;
  long istride0, kstride0, kstride1;
  THTensor *input;
  THTensor* kernel;
  real *input_data;
  real *weight_data;
  real *output_data;
  ptrdiff_t nelem;
  long k;

  THArgCheck(t_->nDimension == 3 , 3, "input: 3D Tensor expected");
  THArgCheck(k_->nDimension == 4 , 4, "kernel: 4D Tensor expected");
  THArgCheck(srow >= 1, 5, "Stride should be a positive integer");
  THArgCheck(scol >= 1, 6, "Stride should be a positive integer");
  THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can 'V' or 'F'");
  THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can 'X' or 'C'");

  input = THTensor_(newContiguous)(t_);
  if (!(k_->stride[3] == 1) || !(k_->stride[2] == k_->size[3])) {
    kernel = THTensor_(newContiguous)(k_);
  } else {
    THTensor_(retain)(k_);
    kernel = k_;
  }

  nInputPlane = input->size[0];
  istride0    = input->stride[0];
  nInputRows  = input->size[1];
  nInputCols  = input->size[2];

  kstride0    = kernel->stride[0];
  kstride1    = kernel->stride[1];
  nKernelRows = kernel->size[2];
  nKernelCols = kernel->size[3];
  nOutputPlane = kernel->size[0];
  THArgCheck(kernel->size[1] == nInputPlane, 2, "invalid number of input planes");

  THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dmv : Input image is smaller than kernel");

  if (*vf == 'F') {
    nOutputRows = (nInputRows - 1) * srow + nKernelRows;
    nOutputCols = (nInputCols - 1) * scol + nKernelCols;
  } else { /* valid */
    nOutputRows = (nInputRows - nKernelRows) / srow + 1;
    nOutputCols = (nInputCols - nKernelCols) / scol + 1;
  }

  nelem = THTensor_(nElement)(r_);
  THTensor_(resize3d)(r_, nOutputPlane, nOutputRows, nOutputCols);

  input_data = THTensor_(data)(input);
  weight_data = THTensor_(data)(kernel);
  output_data = THTensor_(data)(r_);

  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
  {
    /*THTensor_(zero)(r_);*/
#pragma omp parallel for private(k)
    for (k = 0; k < r_->size[0]; k++)
    {
      real* ptr_output = output_data + k*nOutputCols*nOutputRows;
      long l;
      for (l = 0; l < nOutputRows*nOutputCols; l++)
        ptr_output[l] = 0.0;
    }
  }
  else if (beta != 1)
  {
    /*THTensor_(mul)(r_, beta);*/
#pragma omp parallel for private(k)
    for (k = 0; k < r_->size[0]; k++)
    {
      real* ptr_output = output_data + k*nOutputCols*nOutputRows;
      long l;
      for (l = 0; l < nOutputRows*nOutputCols; l++)
        ptr_output[l] *= beta;
    }
  }

#pragma omp parallel for private(k)
  for(k = 0; k < nOutputPlane; k++)
  {
    long i;
    /* get output */
    real *ptr_output = output_data + k*nOutputCols*nOutputRows;
    for(i = 0; i < nInputPlane; i++)
    {
      /* get kernel */
      real *ptr_weight = weight_data + k*kstride0 + i*kstride1;
      /* get input */
      real *ptr_input = input_data + i*istride0;

      /* do image, kernel convolution */
      if (*vf == 'F')
        if (*xc == 'X')
          THTensor_(fullXCorr2Dptr)(ptr_output,
                                    alpha,
                                    ptr_input,  nInputRows,  nInputCols,
                                    ptr_weight, nKernelRows, nKernelCols,
                                    srow, scol);
        else
          THTensor_(fullConv2Dptr)(ptr_output,
                                   alpha,
                                   ptr_input,  nInputRows,  nInputCols,
                                   ptr_weight, nKernelRows, nKernelCols,
                                   srow, scol);
      else
        if (*xc == 'X')
          THTensor_(validXCorr2Dptr)(ptr_output,
                                     alpha,
                                     ptr_input,  nInputRows,  nInputCols,
                                     ptr_weight, nKernelRows, nKernelCols,
                                     srow, scol);
        else
          THTensor_(validConv2Dptr)(ptr_output,
                                    alpha,
                                    ptr_input,  nInputRows,  nInputCols,
                                    ptr_weight, nKernelRows, nKernelCols,
                                    srow, scol);
    }
    /* Next output plane */
    /* output_data += nOutputCols*nOutputRows;*/
  }
  THTensor_(free)(input);
  THTensor_(free)(kernel);
}


/*
  3D input, 4D kernel, 3D output
  matrix vector product like
  y <- Ax + beta*y
*/
void THTensor_(conv2Dmm)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol, const char *vf, const char *xc)
{
  long nInputPlane, nInputRows, nInputCols;
  long nKernelRows, nKernelCols;
  long nOutputPlane, nOutputRows, nOutputCols;
  long kstride0, kstride1;
  THTensor *input;
  THTensor* kernel;
  long nbatch;
  ptrdiff_t nelem;
  real *input_data;
  real *weight_data;
  real *output_data;
  long p;

  THArgCheck(t_->nDimension == 4 , 3, "input: 4D Tensor expected");
  THArgCheck(k_->nDimension == 4 , 4, "kernel: 4D Tensor expected");
  THArgCheck(srow >= 1, 5, "Stride should be a positive integer");
  THArgCheck(scol >= 1, 6, "Stride should be a positive integer");
  THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can 'V' or 'F'");
  THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can 'X' or 'C'");

  input = THTensor_(newContiguous)(t_);
  if (!(k_->stride[3] == 1) || !(k_->stride[2] == k_->size[3])) {
    kernel = THTensor_(newContiguous)(k_);
  } else {
    THTensor_(retain)(k_);
    kernel = k_;
  }

  nbatch = input->size[0];
  nInputPlane = input->size[1];
  nInputRows  = input->size[2];
  nInputCols  = input->size[3];

  kstride0    = kernel->stride[0];
  kstride1    = kernel->stride[1];
  nKernelRows = kernel->size[2];
  nKernelCols = kernel->size[3];
  nOutputPlane = kernel->size[0];
  THArgCheck(kernel->size[1] == nInputPlane, 2, "invalid number of input planes");

  THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dmv : Input image is smaller than kernel");

  if (*vf == 'F') {
    nOutputRows = (nInputRows - 1) * srow + nKernelRows;
    nOutputCols = (nInputCols - 1) * scol + nKernelCols;
  } else { /* valid */
    nOutputRows = (nInputRows - nKernelRows) / srow + 1;
    nOutputCols = (nInputCols - nKernelCols) / scol + 1;
  }

  nelem = THTensor_(nElement)(r_);
  THTensor_(resize4d)(r_, nbatch, nOutputPlane, nOutputRows, nOutputCols);

  input_data = THTensor_(data)(input);
  weight_data = THTensor_(data)(kernel);
  output_data = THTensor_(data)(r_);

  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
  {
    /*THTensor_(zero)(r_);*/
#pragma omp parallel for private(p)
    for (p=0; p < r_->size[0]; p++)
    {
      long k;
      for (k = 0; k < r_->size[1]; k++)
      {
        real* ptr_output = output_data + p*nOutputPlane*nOutputRows*nOutputCols + k*nOutputCols*nOutputRows;
        long l;
        for (l = 0; l < nOutputRows*nOutputCols; l++)
          ptr_output[l] = 0.0;
      }
    }
  }
  else if (beta != 1)
  {
    /*THTensor_(mul)(r_, beta);*/
#pragma omp parallel for private(p)
    for(p=0; p < r_->size[0]; p++)
    {
      long k;
      for (k = 0; k < r_->size[1]; k++)
      {
        real* ptr_output = output_data + p*nOutputPlane*nOutputRows*nOutputCols + k*nOutputCols*nOutputRows;
        long l;
        for (l = 0; l < nOutputRows*nOutputCols; l++)
          ptr_output[l] *= beta;
      }
    }
  }

#pragma omp parallel for private(p)
  for(p=0; p < nbatch; p++)
  {
    long k;
    for(k = 0; k < nOutputPlane; k++)
    {
      long i;
      /* get output */
      real *ptr_output = output_data + p*nOutputPlane*nOutputCols*nOutputRows + k*nOutputCols*nOutputRows;
      for(i = 0; i < nInputPlane; i++)
      {
        /* get kernel */
        real *ptr_weight = weight_data + k*kstride0 + i*kstride1;
        /* get input */
        real *ptr_input = input_data + p*nInputPlane*nInputRows*nInputCols + i*nInputRows*nInputCols;

        /* do image, kernel convolution */
        if (*vf == 'F')
          if (*xc == 'X')
            THTensor_(fullXCorr2Dptr)(ptr_output,
                                      alpha,
                                      ptr_input,  nInputRows,  nInputCols,
                                      ptr_weight, nKernelRows, nKernelCols,
                                      srow, scol);
          else
            THTensor_(fullConv2Dptr)(ptr_output,
                                     alpha,
                                     ptr_input,  nInputRows,  nInputCols,
                                     ptr_weight, nKernelRows, nKernelCols,
                                     srow, scol);
        else
          if (*xc == 'X')
            THTensor_(validXCorr2Dptr)(ptr_output,
                                       alpha,
                                       ptr_input,  nInputRows,  nInputCols,
                                       ptr_weight, nKernelRows, nKernelCols,
                                       srow, scol);
          else
            THTensor_(validConv2Dptr)(ptr_output,
                                      alpha,
                                      ptr_input,  nInputRows,  nInputCols,
                                      ptr_weight, nKernelRows, nKernelCols,
                                      srow, scol);
      }
      /* Next output plane */
      /* output_data += nOutputCols*nOutputRows;*/
    }
  }
  THTensor_(free)(input);
  THTensor_(free)(kernel);
}


/*
  2D input, 2D kernel, 2D output
  scalar multiplication like
  y <- x*y + beta*y
*/
void THTensor_(conv2Dmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol, const char *vf, const char *xc)
{
  THTensor *input;
  THTensor* kernel;
  long nInputRows;
  long nInputCols;
  long nKernelRows;
  long nKernelCols;
  long nOutputRows, nOutputCols;
  real *ptr_input;
  real *ptr_weight;
  real *output_data;
  ptrdiff_t nelem;

  THArgCheck(t_->nDimension == 2 , 3, "input: 2D Tensor expected");
  THArgCheck(k_->nDimension == 2 , 4, "kernel: 2D Tensor expected");
  THArgCheck(srow >= 1, 5, "Stride should be a positive integer");
  THArgCheck(scol >= 1, 6, "Stride should be a positive integer");

  input = THTensor_(newContiguous)(t_);
  kernel = THTensor_(newContiguous)(k_);

  nInputRows  = input->size[0];
  nInputCols  = input->size[1];
  nKernelRows = kernel->size[0];
  nKernelCols = kernel->size[1];

  THArgCheck((nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dmul : Input image is smaller than kernel");

  nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf);
  nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf);

  nelem = THTensor_(nElement)(r_);
  THTensor_(resize2d)(r_, nOutputRows, nOutputCols);
  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
    THTensor_(zero)(r_);
  else if (beta != 1)
    THTensor_(mul)(r_, r_, beta);

  ptr_input = THTensor_(data)(input);
  ptr_weight = THTensor_(data)(kernel);
  output_data = THTensor_(data)(r_);


  /* do image, kernel convolution */
  THTensor_(conv2d)(output_data,
                    alpha,
                    ptr_input, nInputRows, nInputCols,
                    ptr_weight, nKernelRows, nKernelCols,
                    srow, scol, vf, xc);
  THTensor_(free)(input);
  THTensor_(free)(kernel);
}

/*
  3D input, 3D kernel, 3D output
  component wise multiplication like
  y <- y.*x + beta*y
*/
void THTensor_(conv2Dcmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol, const char *vf, const char *xc)
{
  long nInputPlane, nInputRows, nInputCols;
  long nKernelRows, nKernelCols;
  long nOutputPlane, nOutputRows, nOutputCols;
  long istride0, kstride0;
  THTensor *input;
  THTensor *kernel;
  real *input_data;
  real *weight_data;
  real *output_data;
  ptrdiff_t nelem;
  long k;

  THArgCheck(t_->nDimension == 3 , 3, "input: 3D Tensor expected");
  THArgCheck(k_->nDimension == 3 , 4, "kernel: 3D Tensor expected");
  THArgCheck(srow >= 1, 5, "Stride should be a positive integer");
  THArgCheck(scol >= 1, 6, "Stride should be a positive integer");

  input = THTensor_(newContiguous)(t_);
  kernel = THTensor_(newContiguous)(k_);

  istride0    = input->stride[0];
  nInputPlane = input->size[0];
  nInputRows  = input->size[1];
  nInputCols  = input->size[2];

  kstride0    = kernel->stride[0];
  nOutputPlane = kernel->size[0];
  nKernelRows = kernel->size[1];
  nKernelCols = kernel->size[2];

  THArgCheck(nOutputPlane == nInputPlane, 2, "invalid number of input/kernel planes");
  THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dcmul : Input image is smaller than kernel");

  nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf);
  nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf);

  nelem = THTensor_(nElement)(r_);
  THTensor_(resize3d)(r_, nOutputPlane, nOutputRows, nOutputCols);

  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
  {
    THTensor_(zero)(r_);
  }
  else if (beta != 1)
    THTensor_(mul)(r_, r_, beta);

  input_data = THTensor_(data)(input);
  weight_data = THTensor_(data)(kernel);
  output_data = THTensor_(data)(r_);

  for(k = 0; k < nOutputPlane; k++)
  {
    /* get kernel */
    real *ptr_weight = weight_data + k*kstride0;
    /* get input */
    real *ptr_input = input_data + k*istride0;

    /* do image, kernel convolution */
    THTensor_(conv2d)(output_data,
                      alpha,
                      ptr_input, nInputRows, nInputCols,
                      ptr_weight, nKernelRows, nKernelCols,
                      srow, scol, vf, xc);
    /* Next output plane */
    output_data += nOutputCols*nOutputRows;
  }
  THTensor_(free)(input);
  THTensor_(free)(kernel);
}

/*
  3D input, 3D kernel, 3D output
  component wise multiplication like with a permutation map
  y <- y.*x + beta*y
*/
void THTensor_(conv2Dmap)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, THTensor *map, long srow, long scol, const char *vf, const char *xc)
{
  long nInputPlane, nInputRows, nInputCols;
  long nKernelRows, nKernelCols;
  long nOutputPlane, nOutputRows, nOutputCols;
  long istride0, kstride0;
  THTensor *input;
  THTensor* kernel;
  real *input_data;
  real *weight_data;
  real *output_data;
  long nmaps;
  ptrdiff_t nelem;
  long k;

  THArgCheck(t_->nDimension == 3 , 3, "input: 3D Tensor expected");
  THArgCheck(k_->nDimension == 3 , 4, "kernel: 3D Tensor expected");
  THArgCheck(map->nDimension == 2 , 4, "map: 2D Tensor expected");
  THArgCheck(srow >= 1, 6, "Stride should be a positive integer");
  THArgCheck(scol >= 1, 7, "Stride should be a positive integer");

  input = THTensor_(newContiguous)(t_);
  kernel = THTensor_(newContiguous)(k_);

  istride0    = input->stride[0];
  nInputPlane = input->size[0];
  nInputRows  = input->size[1];
  nInputCols  = input->size[2];

  kstride0    = kernel->stride[0];
  nOutputPlane = kernel->size[0];
  nKernelRows = kernel->size[1];
  nKernelCols = kernel->size[2];

  THArgCheck(nOutputPlane == nInputPlane, 2, "invalid number of input/kernel planes");
  THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols)
              || *vf == 'F', 2, "conv2Dmap : Input image is smaller than kernel");

  nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf);
  nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf);

  nelem = THTensor_(nElement)(r_);
  THTensor_(resize3d)(r_, nOutputPlane, nOutputRows, nOutputCols);

  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
  {
    THTensor_(zero)(r_);
  }
  else if (beta != 1)
    THTensor_(mul)(r_, r_, beta);

  input_data = THTensor_(data)(input);
  weight_data = THTensor_(data)(kernel);
  output_data = THTensor_(data)(r_);

  nmaps = map->size[0];

  for(k = 0; k < nmaps; k++)
  {
    /* get indices */
    long from = (long)THTensor_(get2d)(map,k,0)-1;
    long to   = (long)THTensor_(get2d)(map,k,1)-1;

    /* get kernel */
    real *ptr_weight = weight_data + k*kstride0;
    /* get input */
    real *ptr_input = input_data + from*istride0;
    /* get output */
    real *ptr_output = output_data + to*nOutputRows*nOutputCols;

    /* do image, kernel convolution */
    THTensor_(conv2d)(ptr_output,
                      alpha,
                      ptr_input, nInputRows, nInputCols,
                      ptr_weight, nKernelRows, nKernelCols,
                      srow, scol, vf, xc);
  }
  THTensor_(free)(input);
  THTensor_(free)(kernel);
}

/*
  4D input, 4D kernel, 5D output
  like rank1 update
  A <- xx' + beta*A
  for sr,sc=1 this is equivalent to xcorr2Dger, but otherwise it is useful for
  calculating derivatives wrt a kernel that is applied with stride sr,sc != 1
*/
void THTensor_(conv3DRevger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_,
                             long sdepth, long srow, long scol)
{
  long nInputPlane, nInputDepth, nInputRows, nInputCols;
  long nKernelPlane, nKernelDepth, nKernelRows, nKernelCols;
  long nOutputPlane, nOutputDepth, nOutputRows, nOutputCols;
  long istride0, kstride0;
  THTensor *input;
  THTensor *kernel;
  real *input_data;
  real *weight_data;
  real *output_data;
  ptrdiff_t nelem;
  long k, i;

  THArgCheck(t_->nDimension == 4 , 3, "input: 4D Tensor expected");
  THArgCheck(k_->nDimension == 4 , 4, "kernel: 4D Tensor expected");
  THArgCheck(sdepth >= 1, 5, "Stride should be a positive integer");
  THArgCheck(srow >= 1, 6, "Stride should be a positive integer");
  THArgCheck(scol >= 1, 7, "Stride should be a positive integer");

  input = THTensor_(newContiguous)(t_);
  kernel = THTensor_(newContiguous)(k_);

  nInputPlane = input->size[0];
  istride0    = input->stride[0];
  nInputDepth = input->size[1];
  nInputRows  = input->size[2];
  nInputCols  = input->size[3];

  kstride0 = kernel->stride[0];
  nKernelPlane = kernel->size[0];
  nKernelDepth= kernel->size[1];
  nKernelRows = kernel->size[2];
  nKernelCols = kernel->size[3];
  nOutputPlane = nInputPlane * kernel->size[0];

  THArgCheck(nInputDepth >= nKernelDepth && nInputRows >= nKernelRows && nInputCols >= nKernelCols , 2, "conv3DRevger : Input image is smaller than kernel");

  nOutputDepth = nInputDepth - (nKernelDepth - 1) * sdepth;
  nOutputRows = nInputRows - (nKernelRows - 1) * srow;
  nOutputCols = nInputCols - (nKernelCols - 1) * scol;

  nelem = THTensor_(nElement)(r_);
  THTensor_(resize5d)(r_,nKernelPlane, nInputPlane, nOutputDepth, nOutputRows, nOutputCols);

  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
  {
    THTensor_(zero)(r_);
  }
  else if (beta != 1)
    THTensor_(mul)(r_, r_, beta);

  input_data = THTensor_(data)(input);
  weight_data = THTensor_(data)(kernel);
  output_data = THTensor_(data)(r_);

  for(k = 0; k < nKernelPlane; k++)
  {
    /* get kernel */
    real *ptr_weight = weight_data+k*kstride0;

    for(i = 0; i < nInputPlane; i++)
    {
      /* get input */
      real *ptr_input = input_data+i*istride0;

      /* do image, kernel convolution */
      THTensor_(validXCorr3DRevptr)(output_data,
                                    alpha,
                                    ptr_input,  nInputDepth, nInputRows,  nInputCols,
                                    ptr_weight, nKernelDepth, nKernelRows, nKernelCols,
                                    sdepth, srow, scol);
      /* Next output plane */
      output_data += nOutputDepth*nOutputCols*nOutputRows;
    }
  }
  THTensor_(free)(input);
  THTensor_(free)(kernel);
}


/*
  4D input, 4D kernel, 5D output
  like rank1 update
  A <- xx' + beta*A
*/
void THTensor_(conv3Dger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_,
                          long sdepth, long srow, long scol, const char *vf, const char *xc)
{
  long nInputPlane, nInputDepth, nInputRows, nInputCols;
  long nKernelPlane, nKernelDepth, nKernelRows, nKernelCols;
  long nOutputPlane, nOutputDepth, nOutputRows, nOutputCols;
  long istride0, kstride0;
  THTensor *input;
  THTensor *kernel;
  real *input_data;
  real *weight_data;
  real *output_data;
  ptrdiff_t nelem;
  long k, i;

  THArgCheck(t_->nDimension == 4 , 3, "input: 4D Tensor expected");
  THArgCheck(k_->nDimension == 4 , 4, "kernel: 4D Tensor expected");
  THArgCheck(sdepth >= 1, 5, "Stride should be a positive integer");
  THArgCheck(srow >= 1, 6, "Stride should be a positive integer");
  THArgCheck(scol >= 1, 7, "Stride should be a positive integer");
  THArgCheck(*vf == 'V' || *vf == 'F', 8, "type of convolution can 'V' or 'F'");
  THArgCheck(*xc == 'C' || *xc == 'X', 8, "type of convolution can 'X' or 'C'");

  input = THTensor_(newContiguous)(t_);
  kernel = THTensor_(newContiguous)(k_);

  nInputPlane = input->size[0];
  istride0    = input->stride[0];
  nInputDepth = input->size[1];
  nInputRows  = input->size[2];
  nInputCols  = input->size[3];

  kstride0     = kernel->stride[0];
  nKernelPlane = kernel->size[0];
  nKernelDepth = kernel->size[1];
  nKernelRows  = kernel->size[2];
  nKernelCols  = kernel->size[3];
  nOutputPlane = nInputPlane * kernel->size[0];

  THArgCheck((nInputDepth >= nKernelDepth
              && nInputRows >= nKernelRows
              && nInputCols >= nKernelCols)
             || *vf == 'F', 2, "conv3Dger : Input image is smaller than kernel");

  nOutputDepth = THTensor_(convsize)(nInputDepth, nKernelDepth, sdepth, vf);
  nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf);
  nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf);

  nelem = THTensor_(nElement)(r_);
  THTensor_(resize5d)(r_,nKernelPlane, nInputPlane, nOutputDepth, nOutputRows, nOutputCols);

  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
  {
    THTensor_(zero)(r_);
  }
  else if (beta != 1)
    THTensor_(mul)(r_, r_, beta);

  input_data = THTensor_(data)(input);
  weight_data = THTensor_(data)(kernel);
  output_data = THTensor_(data)(r_);

  for(k = 0; k < nKernelPlane; k++)
  {
    /* get kernel */
    real *ptr_weight = weight_data+k*kstride0;

    for(i = 0; i < nInputPlane; i++)
    {
      /* get input */
      real *ptr_input = input_data+i*istride0;

      /* do image, kernel convolution */
      THTensor_(conv3d)(output_data,
                        alpha,
                        ptr_input,  nInputDepth, nInputRows,  nInputCols,
                        ptr_weight, nKernelDepth, nKernelRows, nKernelCols,
                        sdepth, srow, scol, vf, xc);

      /* Next output plane */
      output_data += nOutputDepth*nOutputCols*nOutputRows;
    }
  }
  THTensor_(free)(input);
  THTensor_(free)(kernel);
}

/*
  4D input, 5D kernel, 4D output
  matrix vector product like
  y <- Ax + beta*y
*/
void THTensor_(conv3Dmv)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_,
                         long sdepth, long srow, long scol, const char *vf, const char *xc)
{
  long nInputPlane, nInputDepth, nInputRows, nInputCols;
  long nKernelDepth, nKernelRows, nKernelCols;
  long nOutputPlane, nOutputDepth, nOutputRows, nOutputCols;
  long istride0, kstride0, kstride1;
  THTensor *input;
  THTensor *kernel;
  real *input_data;
  real *weight_data;
  real *output_data;
  ptrdiff_t nelem;
  long k, i;

  THArgCheck(t_->nDimension == 4 , 3, "input: 4D Tensor expected");
  THArgCheck(k_->nDimension == 5 , 4, "kernel: 5D Tensor expected");
  THArgCheck(sdepth >= 1, 5, "Stride should be a positive integer");
  THArgCheck(srow >= 1, 6, "Stride should be a positive integer");
  THArgCheck(scol >= 1, 7, "Stride should be a positive integer");
  THArgCheck(*vf == 'V' || *vf == 'F', 8, "type of convolution can 'V' or 'F'");
  THArgCheck(*xc == 'C' || *xc == 'X', 8, "type of convolution can 'X' or 'C'");

  input = THTensor_(newContiguous)(t_);
  if (!(k_->stride[4] == 1) || !(k_->stride[3] == k_->size[4])) {
    kernel = THTensor_(newContiguous)(k_);
  } else {
    THTensor_(retain)(k_);
    kernel = k_;
  }

  nInputPlane = input->size[0];
  istride0    = input->stride[0];
  nInputDepth = input->size[1];
  nInputRows  = input->size[2];
  nInputCols  = input->size[3];

  kstride0    = kernel->stride[0];
  kstride1    = kernel->stride[1];
  nKernelDepth = kernel->size[2];
  nKernelRows = kernel->size[3];
  nKernelCols = kernel->size[4];
  nOutputPlane = kernel->size[0];
  THArgCheck(kernel->size[1] == nInputPlane, 2, "invalid number of input planes");

  THArgCheck( (nInputDepth >= nKernelDepth && nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv3Dmv : Input image is smaller than kernel");

  nOutputDepth = THTensor_(convsize)(nInputDepth, nKernelDepth, sdepth, vf);
  nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf);
  nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf);

  nelem = THTensor_(nElement)(r_);
  THTensor_(resize4d)(r_, nOutputPlane, nOutputDepth, nOutputRows, nOutputCols);

  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
  {
    THTensor_(zero)(r_);
  }
  else if (beta != 1)
    THTensor_(mul)(r_, r_, beta);

  input_data = THTensor_(data)(input);
  weight_data = THTensor_(data)(kernel);
  output_data = THTensor_(data)(r_);

  for(k = 0; k < nOutputPlane; k++)
  {
    for(i = 0; i < nInputPlane; i++)
    {
      /* get kernel */
      real *ptr_weight = weight_data + k*kstride0 + i*kstride1;
      /* get input */
      real *ptr_input = input_data + i*istride0;

      /* do image, kernel convolution */
      THTensor_(conv3d)(output_data,
                        alpha,
                        ptr_input,  nInputDepth, nInputRows,  nInputCols,
                        ptr_weight, nKernelDepth, nKernelRows, nKernelCols,
                        sdepth, srow, scol, vf, xc);
    }
    /* Next output plane */
    output_data += nOutputDepth*nOutputCols*nOutputRows;
  }
  THTensor_(free)(input);
  THTensor_(free)(kernel);
}

/*
  3D input, 3D kernel, 3D output
  scalar multiplication like
  y <- x*y + beta*y
*/
void THTensor_(conv3Dmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_,
                          long sdepth, long srow, long scol, const char *vf, const char *xc)
{
  THTensor *input;
  THTensor* kernel;
  long nInputDepth;
  long nInputRows;
  long nInputCols;
  long nKernelDepth;
  long nKernelRows;
  long nKernelCols;
  long nOutputDepth, nOutputRows, nOutputCols;
  real *ptr_input;
  real *ptr_weight;
  real *output_data;
  ptrdiff_t nelem;

  THArgCheck(t_->nDimension == 3 , 3, "input: 3D Tensor expected");
  THArgCheck(k_->nDimension == 3 , 4, "kernel: 3D Tensor expected");
  THArgCheck(sdepth >= 1, 5, "Stride should be a positive integer");
  THArgCheck(srow >= 1, 6, "Stride should be a positive integer");
  THArgCheck(scol >= 1, 7, "Stride should be a positive integer");
  THArgCheck(*vf == 'V' || *vf == 'F', 8, "type of convolution can 'V' or 'F'");
  THArgCheck(*xc == 'C' || *xc == 'X', 8, "type of convolution can 'X' or 'C'");

  input = THTensor_(newContiguous)(t_);
  kernel = THTensor_(newContiguous)(k_);

  nInputDepth = input->size[0];
  nInputRows  = input->size[1];
  nInputCols  = input->size[2];
  nKernelDepth = kernel->size[0];
  nKernelRows = kernel->size[1];
  nKernelCols = kernel->size[2];

  THArgCheck((nInputDepth >= nKernelDepth && nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv3Dmul : Input image is smaller than kernel");

  nOutputDepth = THTensor_(convsize)(nInputDepth, nKernelDepth, sdepth, vf);
  nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf);
  nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf);

  nelem = THTensor_(nElement)(r_);
  THTensor_(resize3d)(r_, nOutputDepth, nOutputRows, nOutputCols);
  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
    THTensor_(zero)(r_);
  else if (beta != 1)
    THTensor_(mul)(r_, r_, beta);

  ptr_input = THTensor_(data)(input);
  ptr_weight = THTensor_(data)(kernel);
  output_data = THTensor_(data)(r_);


  /* do image, kernel convolution */
  THTensor_(conv3d)(output_data,
                    alpha,
                    ptr_input,  nInputDepth, nInputRows,  nInputCols,
                    ptr_weight, nKernelDepth, nKernelRows, nKernelCols,
                    sdepth, srow, scol, vf, xc);
  THTensor_(free)(input);
  THTensor_(free)(kernel);
}

/*
  4D input, 4D kernel, 4D output
  component wise multiplication like
  y <- y.*x + beta*y
*/
void THTensor_(conv3Dcmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_,
                           long sdepth, long srow, long scol, const char *vf, const char *xc)
{
  long nInputPlane, nInputDepth, nInputRows, nInputCols;
  long nKernelDepth, nKernelRows, nKernelCols;
  long nOutputPlane, nOutputDepth, nOutputRows, nOutputCols;
  long istride0, kstride0;

  THTensor *input;
  THTensor *kernel;
  real *input_data;
  real *weight_data;
  real *output_data;
  ptrdiff_t nelem;
  long k;

  THArgCheck(t_->nDimension == 4 , 3, "input: 3D Tensor expected");
  THArgCheck(k_->nDimension == 4 , 4, "kernel: 3D Tensor expected");
  THArgCheck(srow >= 1, 5, "Stride should be a positive integer");
  THArgCheck(scol >= 1, 6, "Stride should be a positive integer");
  THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can 'V' or 'F'");
  THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can 'X' or 'C'");

  input = THTensor_(newContiguous)(t_);
  kernel = THTensor_(newContiguous)(k_);

  istride0    = input->stride[0];
  nInputPlane = input->size[0];
  nInputDepth = input->size[1];
  nInputRows  = input->size[2];
  nInputCols  = input->size[3];

  kstride0    = kernel->stride[0];
  nOutputPlane = kernel->size[0];
  nKernelDepth = kernel->size[1];
  nKernelRows = kernel->size[2];
  nKernelCols = kernel->size[3];

  THArgCheck(nOutputPlane == nInputPlane, 2, "invalid number of input/kernel planes");
  THArgCheck( (nInputDepth >= nKernelDepth && nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv3Dcmul : Input image is smaller than kernel");

  nOutputDepth = THTensor_(convsize)(nInputDepth, nKernelDepth, sdepth, vf);
  nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf);
  nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf);

  nelem = THTensor_(nElement)(r_);
  THTensor_(resize4d)(r_, nOutputPlane, nOutputDepth, nOutputRows, nOutputCols);

  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
  {
    THTensor_(zero)(r_);
  }
  else if (beta != 1)
    THTensor_(mul)(r_, r_, beta);

  input_data = THTensor_(data)(input);
  weight_data = THTensor_(data)(kernel);
  output_data = THTensor_(data)(r_);

  for(k = 0; k < nOutputPlane; k++)
  {
    /* get kernel */
    real *ptr_weight = weight_data + k*kstride0;
    /* get input */
    real *ptr_input = input_data + k*istride0;

    /* do image, kernel convolution */
    THTensor_(conv3d)(output_data,
                      alpha,
                      ptr_input,  nInputDepth, nInputRows,  nInputCols,
                      ptr_weight, nKernelDepth, nKernelRows, nKernelCols,
                      sdepth, srow, scol, vf, xc);

    /* Next output plane */
    output_data += nOutputDepth*nOutputCols*nOutputRows;
  }
  THTensor_(free)(input);
  THTensor_(free)(kernel);
}

/*
  4D input, 4D kernel, 4D output
  component wise multiplication like with a permutation map
  y <- y.*x + beta*y
*/
void THTensor_(conv3Dmap)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, THTensor *map,
                          long sdepth, long srow, long scol, const char *vf, const char *xc)
{
  long nInputPlane, nInputDepth, nInputRows, nInputCols;
  long nKernelDepth, nKernelRows, nKernelCols;
  long nOutputPlane, nOutputDepth, nOutputRows, nOutputCols;
  long istride0, kstride0;

  THTensor *input;
  THTensor *kernel;
  ptrdiff_t nelem;
  real *input_data;
  real *weight_data;
  real *output_data;
  long nmaps;
  long k;

  THArgCheck(t_->nDimension == 4 , 3, "input: 4D Tensor expected");
  THArgCheck(k_->nDimension == 4 , 4, "kernel: 4D Tensor expected");
  THArgCheck(map->nDimension == 2 , 4, "map: 2D Tensor expected");
  THArgCheck(srow >= 1, 6, "Stride should be a positive integer");
  THArgCheck(scol >= 1, 7, "Stride should be a positive integer");
  THArgCheck(*vf == 'V' || *vf == 'F', 8, "type of convolution can 'V' or 'F'");
  THArgCheck(*xc == 'C' || *xc == 'X', 8, "type of convolution can 'X' or 'C'");

  input = THTensor_(newContiguous)(t_);
  kernel = THTensor_(newContiguous)(k_);

  istride0    = input->stride[0];
  nInputPlane = input->size[0];
  nInputDepth = input->size[1];
  nInputRows  = input->size[2];
  nInputCols  = input->size[3];

  kstride0    = kernel->stride[0];
  nOutputPlane = kernel->size[0];
  nKernelDepth = kernel->size[1];
  nKernelRows = kernel->size[2];
  nKernelCols = kernel->size[3];

  THArgCheck(nOutputPlane == nInputPlane, 2, "invalid number of input/kernel planes");
  THArgCheck((nInputDepth >= nKernelDepth
              && nInputRows >= nKernelRows
              && nInputCols >= nKernelCols) || *vf == 'F',
             2, "conv3Dmap : Input image is smaller than kernel");

  nOutputDepth = THTensor_(convsize)(nInputDepth, nKernelDepth, sdepth, vf);
  nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf);
  nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf);

  nelem = THTensor_(nElement)(r_);
  THTensor_(resize4d)(r_, nOutputPlane, nOutputDepth, nOutputRows, nOutputCols);

  if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
  {
    THTensor_(zero)(r_);
  }
  else if (beta != 1)
    THTensor_(mul)(r_, r_, beta);

  input_data = THTensor_(data)(input);
  weight_data = THTensor_(data)(kernel);
  output_data = THTensor_(data)(r_);

  nmaps = map->size[0];

  for(k = 0; k < nmaps; k++)
  {
    /* get indices */
    long from = (long)THTensor_(get2d)(map,k,0)-1;
    long to   = (long)THTensor_(get2d)(map,k,1)-1;

    /* get kernel */
    real *ptr_weight = weight_data + k*kstride0;
    /* get input */
    real *ptr_input = input_data + from*istride0;
    /* get output */
    real *ptr_output = output_data + to*nOutputDepth*nOutputRows*nOutputCols;

    /* do image, kernel convolution */
    THTensor_(conv3d)(ptr_output,
                      alpha,
                      ptr_input,  nInputDepth, nInputRows,  nInputCols,
                      ptr_weight, nKernelDepth, nKernelRows, nKernelCols,
                      sdepth, srow, scol, vf, xc);
  }
  THTensor_(free)(input);
  THTensor_(free)(kernel);
}
#endif