#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/unfold.c"
#else

/* note: due to write issues, this one cannot be parallelized as well as unfolded_copy */
void THNN_(unfolded_acc)(
          THTensor *finput,
          THTensor *input,
          int kW,
          int kH,
          int dW,
          int dH,
          int padW,
          int padH,
          int nInputPlane,
          int inputWidth,
          int inputHeight,
          int outputWidth,
          int outputHeight)
{
  // This function assumes that
  // outputHeight*dH does not overflow a long
  // outputWidth*dW does not overflow a long

  int nip;

  real *input_data = THTensor_(data)(input);
  real *finput_data = THTensor_(data)(finput);

#pragma omp parallel for private(nip)
  for(nip = 0; nip < nInputPlane; nip++)
  {
    int kw, kh, y, x;
    long ix, iy;
    for(kh = 0; kh < kH; kh++)
    {
      for(kw = 0; kw < kW; kw++)
      {
        real *src = finput_data + nip*((size_t)kH*kW*outputHeight*outputWidth) + kh*((size_t)kW*outputHeight*outputWidth) + kw*((size_t)outputHeight*outputWidth);
        real *dst = input_data + nip*((size_t)inputHeight*inputWidth);
        if (padW > 0 || padH > 0) {
          int lpad,rpad;
          for(y = 0; y < outputHeight; y++) {
            iy = (long)y*dH - padH + kh;
            if (iy < 0 || iy >= inputHeight) {
            } else {
              if (dW==1){
                 ix = 0 - padW + kw;
                 lpad = fmaxf(0,padW-kw);
                 rpad = fmaxf(0,padW-(kW-kw-1));
                 real *dst_slice = dst+(size_t)iy*inputWidth+ix+lpad;
                 THVector_(cadd)(dst_slice, dst_slice, src+(size_t)y*outputWidth+lpad, 1, outputWidth - lpad - rpad); /* note: THVector_add could handle 1 value better */
              }
              else{
                for (x=0; x<outputWidth; x++){
                   ix = (long)x*dW - padW + kw;
                   if (ix < 0 || ix >= inputWidth){
                   }else{
                     real *dst_slice = dst+(size_t)iy*inputWidth+ix;
                     THVector_(cadd)(dst_slice, dst_slice, src+(size_t)y*outputWidth+x, 1, 1);
                   }
                }
              }
            }
          }
        } else {
          for(y = 0; y < outputHeight; y++) {
            iy = (long)y*dH + kh;
            ix = 0 + kw;
            if (dW == 1 ) {
               real *dst_slice = dst+(size_t)iy*inputWidth+ix;
               THVector_(cadd)(dst_slice, dst_slice, src+(size_t)y*outputWidth, 1, outputWidth); /* note: THVector_add could handle 1 value better */
            }else{
              for(x = 0; x < outputWidth; x++) {
                real *dst_slice = dst+(size_t)iy*inputWidth+ix+x*dW;
                THVector_(cadd)(dst_slice, dst_slice, src+(size_t)y*outputWidth+x, 1, 1);
              }
            }
          }
        }
      }
    }
  }
}

void THNN_(unfolded_copy)(
          THTensor *finput,
          THTensor *input,
          int kW,
          int kH,
          int dW,
          int dH,
          int padW,
          int padH,
          int nInputPlane,
          int inputWidth,
          int inputHeight,
          int outputWidth,
          int outputHeight)
{
  // This function assumes that
  // kH*kW does not overflow an int
  // nInputPlane*kH*kW does not overflow a long
  // outputHeight*dH does not overflow a long
  // outputWidth*dW does not overflow a long

  long k;
  real *input_data = THTensor_(data)(input);
  real *finput_data = THTensor_(data)(finput);

#pragma omp parallel for private(k)
  for(k = 0; k < (long)nInputPlane*kH*kW; k++) {
    long nip = k / (kH*kW);
    long rest = k % (kH*kW);
    long kh = rest / kW;
    long kw = rest % kW;
    int x, y;
    long ix, iy;
    real *dst = finput_data + nip*((size_t)kH*kW*outputHeight*outputWidth) + kh*((size_t)kW*outputHeight*outputWidth) + kw*((size_t)outputHeight*outputWidth);
    real *src = input_data + nip*((size_t)inputHeight*inputWidth);
    if (padW > 0 || padH > 0) {
      long lpad,rpad;
      for(y = 0; y < outputHeight; y++) {
        iy = (long)y*dH - padH + kh;
        if (iy < 0 || iy >= inputHeight) {
          memset(dst+(size_t)y*outputWidth, 0, sizeof(real)*outputWidth);
        } else {
          if (dW==1){
             ix = 0 - padW + kw;
             lpad = fmaxf(0,padW-kw);
             rpad = fmaxf(0,padW-(kW-kw-1));
             if (outputWidth-rpad-lpad <= 0) {
                memset(dst+(size_t)y*outputWidth, 0, sizeof(real)*outputWidth);
             } else {
                if (lpad > 0) memset(dst+(size_t)y*outputWidth, 0, sizeof(real)*lpad);
                memcpy(dst+(size_t)y*outputWidth+lpad, src+(size_t)iy*inputWidth+ix+lpad, sizeof(real)*(outputWidth-rpad-lpad));
                if (rpad > 0) memset(dst+(size_t)y*outputWidth + outputWidth - rpad, 0, sizeof(real)*rpad);
             }
          }
          else{
            for (x=0; x<outputWidth; x++){
               ix = (long)x*dW - padW + kw;
               if (ix < 0 || ix >= inputWidth)
                 memset(dst+(size_t)y*outputWidth+x, 0, sizeof(real)*1);
               else
                 memcpy(dst+(size_t)y*outputWidth+x, src+(size_t)iy*inputWidth+ix, sizeof(real)*(1));
            }
          }
        }
      }
    } else {
      for(y = 0; y < outputHeight; y++) {
        iy = (long)y*dH + kh;
        ix = 0 + kw;
        if (dW == 1)
           memcpy(dst+(size_t)y*outputWidth, src+(size_t)iy*inputWidth+ix, sizeof(real)*outputWidth);
        else{
          for (x=0; x<outputWidth; x++)
             memcpy(dst+(size_t)y*outputWidth+x, src+(size_t)iy*inputWidth+ix+(long)x*dW, sizeof(real)*(1));
         }
      }
    }
  }
}

#endif