#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/TemporalMaxPooling.c"
#else

/*
 * Validate the arguments shared by the forward and backward passes of
 * temporal max pooling.
 *
 * state      - THNN state handle (not used by this function)
 * input      - input tensor; must be 2D, or 3D in batch mode
 * gradOutput - optional (may be NULL); when given, its shape is checked
 *              through THNN_CHECK_DIM_SIZE against the expected output shape
 * indices    - optional (may be NULL); when given, its shape is checked
 *              through THNN_CHECK_DIM_SIZE_INDICES against the same shape
 * kW         - pooling window size along the sequence dimension, must be > 0
 * dW         - stride along the sequence dimension, must be > 0
 *
 * Failures are reported through THArgCheck / THNN_ARGCHECK.  The size reads
 * and the division by dW are deliberately placed AFTER the checks that make
 * them safe (this relies on those checks not returning on failure -- confirm
 * against the TH error handler in use).
 */
static inline void THNN_(TemporalMaxPooling_shapeCheck)(
                         THNNState *state,
                         THTensor *input,
                         THTensor *gradOutput,
                         THIndexTensor *indices,
                         int kW,
                         int dW) {
  long niframe;   /* number of input frames (sequence length) */
  long framesize; /* number of features per frame */
  long noframe;   /* number of output frames */

  int dimS = 0; // sequence dimension
  int dimF = 1; // feature dimension
  int ndims = input->nDimension;

  /* batch mode: dimension 0 is the batch, so both dims shift by one */
  if (ndims == 3)
  {
    dimS = 1;
    dimF = 2;
  }

  THArgCheck(kW > 0, 5,
             "kernel size should be greater than zero, but got kW: %d", kW);
  THArgCheck(dW > 0, 6,
             "stride should be greater than zero, but got dW: %d", dW);

  THNN_ARGCHECK(ndims == 2 || ndims == 3, 2, input,
                  "2D or 3D (batch mode) tensor expected for input, but got: %s");

  /* only index input->size[] once the rank is known to be 2 or 3 */
  niframe = input->size[dimS];
  framesize = input->size[dimF];

  /* niframe is a long, so it is printed with %ld rather than %d */
  THArgCheck(niframe >= kW, 2,
             "input sequence smaller than kernel size. Got: %ld, Expected: %d",
             niframe, kW);

  /* dW > 0 was verified above, so this division cannot be by zero */
  noframe = (niframe - kW) / dW + 1;

  if (gradOutput != NULL) {
    THNN_CHECK_DIM_SIZE(gradOutput, ndims, dimS, noframe);
    THNN_CHECK_DIM_SIZE(gradOutput, ndims, dimF, framesize);
  }
  if (indices != NULL) {
    THNN_CHECK_DIM_SIZE_INDICES(indices, ndims, dimS, noframe);
    THNN_CHECK_DIM_SIZE_INDICES(indices, ndims, dimF, framesize);
  }
}

/*
 * Forward pass of temporal (1D) max pooling.
 *
 * For every output frame t and every feature y, takes the maximum over the
 * kW input frames starting at frame t*dW, writes it to `output`, and writes
 * the offset of the winning frame inside its window (0 .. kW-1) to `indices`.
 *
 * state   - THNN state handle, forwarded to the shape check
 * input   - 2D tensor, or 3D tensor in batch mode (enforced by the shape
 *           check); a contiguous copy/view is taken internally and released
 *           before returning
 * output  - resized to (noframe, framesize) or (nbframe, noframe, framesize)
 * indices - resized to the same shape as `output`
 * kW      - pooling window size along the sequence dimension
 * dW      - stride along the sequence dimension
 *
 * Note: the comparison is a strict `val > maxval` starting from -THInf, so a
 * window in which no value compares greater (e.g. every value is NaN) yields
 * -THInf in `output` and -1 in `indices`.
 */
void THNN_(TemporalMaxPooling_updateOutput)(
          THNNState *state,
          THTensor *input,
          THTensor *output,
          THIndexTensor *indices,
          int kW,
          int dW)
{
  long niframe;
  long framesize;
  long noframe;
  long nbframe = 1; /* a 2D input is processed as a batch of one sample */

  real *input_data;
  real *output_data;
  THIndex_t *indices_data;

  long i, t, y;

  int dimS = 0; // sequence dimension
  int dimF = 1; // feature dimension

  THNN_(TemporalMaxPooling_shapeCheck)(state, input, NULL, NULL, kW, dW);

  if (input->nDimension == 3)
  {
    dimS = 1;
    dimF = 2;
    nbframe = input->size[0];
  }

  /* sizes */
  niframe = input->size[dimS];
  framesize = input->size[dimF];
  noframe = (niframe - kW) / dW + 1;

  /* get contiguous input */
  input = THTensor_(newContiguous)(input);

  /* resize output; indices will contain index locations for each output point */
  if (input->nDimension == 3)
  {
    THTensor_(resize3d)(output, nbframe, noframe, framesize);
    THIndexTensor_(resize3d)(indices, nbframe, noframe, framesize);
  }
  else
  {
    THTensor_(resize2d)(output, noframe, framesize);
    THIndexTensor_(resize2d)(indices, noframe, framesize);
  }

  /* get raw pointers */
  input_data = THTensor_(data)(input);
  output_data = THTensor_(data)(output);
  indices_data = THIndexTensor_(data)(indices);

  for(i = 0; i < nbframe; i++)
  {
    /* start of sample i in each buffer (offset 0 in the non-batch case) */
    real *inputSample_data = input_data + i*niframe*framesize;
    real *outputSample_data = output_data + i*noframe*framesize;
    THIndex_t *indicesSample_data = indices_data + i*noframe*framesize;

    for(t = 0; t < noframe; t++)
    {
      /* window t starts dW frames after window t-1 */
      real *ip = inputSample_data + t*framesize*dW;
      real *op = outputSample_data + t*framesize;
      THIndex_t *xp = indicesSample_data + t*framesize;

#pragma omp parallel for private(y)
      for(y = 0; y < framesize; y++)
      {
        /* compute local max: */
        long maxindex = -1;
        real maxval = -THInf;
        long x;
        for(x = 0; x < kW; x++)
        {
          real val = ip[x*framesize+y];
          if (val > maxval)
          {
            maxval = val;
            maxindex = x;
          }
        }

        /* set output to local max */
        op[y] = maxval;
        /* store the index as an integer; do not round-trip it through `real` */
        xp[y] = (THIndex_t)maxindex;
      }
    }
  }

  /* cleanup */
  THTensor_(free)(input);
}

/*
 * Backward pass of temporal (1D) max pooling.
 *
 * Resizes `gradInput` to the shape of `input`, zeroes it, and then, for every
 * output position, adds the corresponding `gradOutput` value to the input
 * position recorded in `indices` (an offset inside the pooling window that
 * starts at frame t*dW).  Positions whose recorded index is -1 receive no
 * gradient.
 *
 * state      - THNN state handle, forwarded to the shape check
 * input      - the forward input, 2D or 3D in batch mode (enforced by the
 *              shape check); only its sizes are read here
 * gradOutput - gradient w.r.t. the forward output; a contiguous copy/view is
 *              taken internally and released before returning
 * gradInput  - receives the gradient w.r.t. `input`
 * indices    - window offsets of the maxima; its shape is validated by the
 *              shape check
 * kW         - pooling window size (used for validation only)
 * dW         - stride along the sequence dimension
 */
void THNN_(TemporalMaxPooling_updateGradInput)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradInput,
          THIndexTensor *indices,
          int kW,
          int dW)
{
  long niframe;
  long noframe;   /* long, like the tensor size it is read from */
  long framesize;
  long nbframe = 1; /* a 2D input is processed as a batch of one sample */

  real *gradInput_data;
  real *gradOutput_data;
  THIndex_t *indices_data;

  long i, t, y;

  int dimS = 0; // sequence dimension
  int dimF = 1; // feature dimension

  THNN_(TemporalMaxPooling_shapeCheck)(state, input, gradOutput, indices, kW, dW);

  /* get contiguous gradOutput */
  gradOutput = THTensor_(newContiguous)(gradOutput);

  /* resize and zero */
  THTensor_(resizeAs)(gradInput, input);
  THTensor_(zero)(gradInput);

  if (input->nDimension == 3)
  {
    dimS = 1;
    dimF = 2;
    nbframe = input->size[0];
  }

  /* sizes */
  niframe = input->size[dimS];
  noframe = gradOutput->size[dimS];
  framesize = gradOutput->size[dimF];

  /* get raw pointers */
  gradInput_data = THTensor_(data)(gradInput);
  gradOutput_data = THTensor_(data)(gradOutput);
  indices_data = THIndexTensor_(data)(indices);

  for(i = 0; i < nbframe; i++)
  {
    /* start of sample i in each buffer (offset 0 in the non-batch case) */
    real *gradInputSample_data = gradInput_data + i*niframe*framesize;
    real *gradOutputSample_data = gradOutput_data + i*noframe*framesize;
    THIndex_t *indicesSample_data = indices_data + i*noframe*framesize;

    /*
     * Windows of different t can overlap, so several t may accumulate into
     * the same gradInput element; the t loop therefore stays sequential.
     * Within one t, distinct y always address distinct elements, which is
     * why only the y loop is parallelised.
     */
    for(t = 0; t < noframe; t++)
    {
      real *gip = gradInputSample_data + t*framesize*dW;
      real *gop = gradOutputSample_data + t*framesize;
      THIndex_t *xp = indicesSample_data + t*framesize;

#pragma omp parallel for private(y)
      for(y = 0; y < framesize; y++)
      {
        /* route the gradient to the recorded argmax; -1 means "no maximum" */
        long maxindex = (long)xp[y];
        if (maxindex != -1)
          gip[maxindex*framesize+y] += gop[y];
      }
    }
  }

  /* cleanup */
  THTensor_(free)(gradOutput);
}

#endif