#ifndef TH_GENERIC_FILE #define TH_GENERIC_FILE "generic/unfold.c" #else /* note: due to write issues, this one cannot be parallelized as well as unfolded_copy */ void THNN_(unfolded_acc)( THTensor *finput, THTensor *input, int kW, int kH, int dW, int dH, int padW, int padH, int nInputPlane, int inputWidth, int inputHeight, int outputWidth, int outputHeight) { // This function assumes that // outputHeight*dH does not overflow a long // outputWidth*dW does not overflow a long int nip; real *input_data = THTensor_(data)(input); real *finput_data = THTensor_(data)(finput); #pragma omp parallel for private(nip) for(nip = 0; nip < nInputPlane; nip++) { int kw, kh, y, x; long ix, iy; for(kh = 0; kh < kH; kh++) { for(kw = 0; kw < kW; kw++) { real *src = finput_data + nip*((size_t)kH*kW*outputHeight*outputWidth) + kh*((size_t)kW*outputHeight*outputWidth) + kw*((size_t)outputHeight*outputWidth); real *dst = input_data + nip*((size_t)inputHeight*inputWidth); if (padW > 0 || padH > 0) { int lpad,rpad; for(y = 0; y < outputHeight; y++) { iy = (long)y*dH - padH + kh; if (iy < 0 || iy >= inputHeight) { } else { if (dW==1){ ix = 0 - padW + kw; lpad = fmaxf(0,padW-kw); rpad = fmaxf(0,padW-(kW-kw-1)); real *dst_slice = dst+(size_t)iy*inputWidth+ix+lpad; THVector_(cadd)(dst_slice, dst_slice, src+(size_t)y*outputWidth+lpad, 1, outputWidth - lpad - rpad); /* note: THVector_add could handle 1 value better */ } else{ for (x=0; x= inputWidth){ }else{ real *dst_slice = dst+(size_t)iy*inputWidth+ix; THVector_(cadd)(dst_slice, dst_slice, src+(size_t)y*outputWidth+x, 1, 1); } } } } } } else { for(y = 0; y < outputHeight; y++) { iy = (long)y*dH + kh; ix = 0 + kw; if (dW == 1 ) { real *dst_slice = dst+(size_t)iy*inputWidth+ix; THVector_(cadd)(dst_slice, dst_slice, src+(size_t)y*outputWidth, 1, outputWidth); /* note: THVector_add could handle 1 value better */ }else{ for(x = 0; x < outputWidth; x++) { real *dst_slice = dst+(size_t)iy*inputWidth+ix+x*dW; THVector_(cadd)(dst_slice, dst_slice, src+(size_t)y*outputWidth+x, 1, 1); } } } } } } } } void THNN_(unfolded_copy)( THTensor *finput, THTensor *input, int kW, int kH, int dW, int dH, int padW, int padH, int nInputPlane, int inputWidth, int inputHeight, int outputWidth, int outputHeight) { // This function assumes that // kH*kW does not overflow an int // nInputPlane*kH*kW does not overflow a long // outputHeight*dH does not overflow a long // outputWidth*dW does not overflow a long long k; real *input_data = THTensor_(data)(input); real *finput_data = THTensor_(data)(finput); #pragma omp parallel for private(k) for(k = 0; k < (long)nInputPlane*kH*kW; k++) { long nip = k / (kH*kW); long rest = k % (kH*kW); long kh = rest / kW; long kw = rest % kW; int x, y; long ix, iy; real *dst = finput_data + nip*((size_t)kH*kW*outputHeight*outputWidth) + kh*((size_t)kW*outputHeight*outputWidth) + kw*((size_t)outputHeight*outputWidth); real *src = input_data + nip*((size_t)inputHeight*inputWidth); if (padW > 0 || padH > 0) { long lpad,rpad; for(y = 0; y < outputHeight; y++) { iy = (long)y*dH - padH + kh; if (iy < 0 || iy >= inputHeight) { memset(dst+(size_t)y*outputWidth, 0, sizeof(real)*outputWidth); } else { if (dW==1){ ix = 0 - padW + kw; lpad = fmaxf(0,padW-kw); rpad = fmaxf(0,padW-(kW-kw-1)); if (outputWidth-rpad-lpad <= 0) { memset(dst+(size_t)y*outputWidth, 0, sizeof(real)*outputWidth); } else { if (lpad > 0) memset(dst+(size_t)y*outputWidth, 0, sizeof(real)*lpad); memcpy(dst+(size_t)y*outputWidth+lpad, src+(size_t)iy*inputWidth+ix+lpad, sizeof(real)*(outputWidth-rpad-lpad)); if (rpad > 0) memset(dst+(size_t)y*outputWidth + outputWidth - rpad, 0, sizeof(real)*rpad); } } else{ for (x=0; x= inputWidth) memset(dst+(size_t)y*outputWidth+x, 0, sizeof(real)*1); else memcpy(dst+(size_t)y*outputWidth+x, src+(size_t)iy*inputWidth+ix, sizeof(real)*(1)); } } } } } else { for(y = 0; y < outputHeight; y++) { iy = (long)y*dH + kh; ix = 0 + kw; if (dW == 1) memcpy(dst+(size_t)y*outputWidth, src+(size_t)iy*inputWidth+ix, sizeof(real)*outputWidth); else{ for (x=0; x