/*
 * path: root/contrib/lua-torch/torch7/lib/TH/generic/simd/convolve.c
 * blob: da7a4bb20bc4b9a27d6b73a09afd583c09957369
 */
#if defined(USE_AVX) && defined(__AVX__)

#ifdef _MSC_VER
#include <intrin.h>

/*
 * MSVC stand-in for the GCC-style __get_cpuid() helper.
 *
 * Runs CPUID for leaf `__level` through the MSVC __cpuid intrinsic and stores
 * the four result registers through the output pointers.
 *
 * Returns 1 unconditionally; no check is made that the requested leaf is
 * actually supported by the processor.
 */
static __inline int __get_cpuid (unsigned int __level, unsigned int *__eax,
                                 unsigned int *__ebx, unsigned int *__ecx,
                                 unsigned int *__edx) {
  /* The MSVC intrinsic is declared as __cpuid(int[4], int): use a signed
     buffer and convert explicitly instead of handing it an unsigned array. */
  int regs[4] = {0, 0, 0, 0};
  __cpuid(regs, (int)__level);
  *__eax = (unsigned int)regs[0];
  *__ebx = (unsigned int)regs[1];
  *__ecx = (unsigned int)regs[2];
  *__edx = (unsigned int)regs[3];
  return 1;
}

/*
 * MSVC variant of xgetbv(): read an extended control register.
 *
 * op  - register index; only index 0 (_XCR_XFEATURE_ENABLED_MASK) is queried,
 *       any other value leaves both outputs at 0.
 * eax - receives the low 32 bits of the register value.
 * edx - receives the high 32 bits of the register value, matching the
 *       EDX:EAX split produced by the inline-asm variant of this helper.
 */
static void xgetbv(unsigned int op, unsigned int* eax, unsigned int* edx) {
  *eax = 0;
  *edx = 0;
  if (op == 0) {
    /* _xgetbv returns a 64-bit value; split it explicitly rather than
       relying on an implicit narrowing conversion. */
    unsigned long long xcr = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
    *eax = (unsigned int)(xcr & 0xFFFFFFFFu);
    *edx = (unsigned int)(xcr >> 32);
  }
}

#else

/*
 * __cpuid(level, eax, ebx, ecx, edx)
 *
 * Executes the CPUID instruction with `level` loaded into EAX and stores the
 * resulting EAX/EBX/ECX/EDX values into the four lvalue arguments.
 * Only EAX is set up as an input; no sub-leaf value is loaded into ECX.
 */
#if __i386__
/* 32-bit x86: EBX is pushed before CPUID and popped afterwards, and its
   value is copied out through a compiler-chosen register (operand %1)
   instead of being bound directly.
   NOTE(review): presumably because EBX may be reserved (e.g. as the PIC
   base register) on i386 - confirm. */
#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \
__asm("  pushl  %%ebx\n" \
"  cpuid\n" \
"  mov    %%ebx,%1\n" \
"  popl   %%ebx" \
: "=a"(__eax), "=r" (__ebx), "=c"(__ecx), "=d"(__edx) \
: "0"(__level))
#else
/* Other targets: all four result registers are bound directly through the
   "a", "b", "c" and "d" constraints. */
#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \
__asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \
: "0"(__level))
#endif

/*
 * Query CPUID leaf `__level` via the __cpuid macro and hand the four result
 * registers back through the output pointers.
 *
 * Always returns 1; the leaf is not checked against the highest leaf the
 * processor supports.
 */
static __inline int __get_cpuid (unsigned int __level, unsigned int *__eax,
                                 unsigned int *__ebx, unsigned int *__ecx,
                                 unsigned int *__edx) {
  unsigned int regA = 0;
  unsigned int regB = 0;
  unsigned int regC = 0;
  unsigned int regD = 0;

  __cpuid(__level, regA, regB, regC, regD);

  *__eax = regA;
  *__ebx = regB;
  *__ecx = regC;
  *__edx = regD;
  return 1;
}

/*
 * Read an extended control register with the XGETBV instruction.
 *
 * op  - register index, passed to the instruction in ECX.
 * eax - receives the value left in EAX (low half of the result).
 * edx - receives the value left in EDX (high half of the result).
 *
 * The instruction is emitted as raw opcode bytes (0x0f 0x01 0xd0) rather
 * than by mnemonic.
 * NOTE(review): presumably so assemblers that do not know the mnemonic can
 * still build this file - confirm.
 */
static void xgetbv(unsigned int op, unsigned int* eax, unsigned int* edx) {
  __asm__ __volatile__
  (".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc");
}

#endif

/*
 * Bit flags describing the SIMD instruction sets detected on the host CPU.
 * Each enumerator occupies its own bit so flags can be OR-ed into one mask.
 */
enum ECPUFeature
{
  kCPUFeature_SSE    = 1 << 0,
  kCPUFeature_SSE2   = 1 << 1,
  kCPUFeature_SSE3   = 1 << 2,
  kCPUFeature_SSE3_S = 1 << 3,
  kCPUFeature_SSE4_1 = 1 << 4,
  kCPUFeature_SSE4_2 = 1 << 5,
  kCPUFeature_AVX    = 1 << 6
};

/*
 * Probe the host processor and return a mask of ECPUFeature flags.
 *
 * The SSE-family flags come straight from the EDX/ECX feature bits of CPUID
 * leaf 1. AVX is only reported when ECX bits 26, 27 and 28 are all set and
 * the value read with xgetbv(0) has both bit 1 and bit 2 set.
 */
static unsigned int checkCPUFeatures() {
  /* CPUID leaf 1 bit masks examined below. */
  const unsigned int edxSse   = 1u << 25;
  const unsigned int edxSse2  = 1u << 26;
  const unsigned int ecxSse3  = 1u << 0;
  const unsigned int ecxSsse3 = 1u << 9;
  const unsigned int ecxSse41 = 1u << 19;
  const unsigned int ecxSse42 = 1u << 20;
  /* All three ECX bits must be present before XGETBV is attempted. */
  const unsigned int ecxAvxPrereq = (1u << 28) | (1u << 27) | (1u << 26);
  /* Bits 1 and 2 of the register read through xgetbv(0). */
  const unsigned int xcrRequired = 6;

  unsigned int regA = 0, regB = 0, regC = 0, regD = 0;
  unsigned int detected = 0;

  __get_cpuid(1, &regA, &regB, &regC, &regD);

  if (regD & edxSse) {
    detected |= kCPUFeature_SSE;
  }
  if (regD & edxSse2) {
    detected |= kCPUFeature_SSE2;
  }
  if (regC & ecxSse3) {
    detected |= kCPUFeature_SSE3;
  }
  if (regC & ecxSsse3) {
    detected |= kCPUFeature_SSE3_S;
  }
  if (regC & ecxSse41) {
    detected |= kCPUFeature_SSE4_1;
  }
  if (regC & ecxSse42) {
    detected |= kCPUFeature_SSE4_2;
  }

  if ((regC & ecxAvxPrereq) == ecxAvxPrereq) {
    unsigned int xcrLow = 0, xcrHigh = 0;
    xgetbv(0, &xcrLow, &xcrHigh);
    if ((xcrLow & xcrRequired) == xcrRequired) {
      detected |= kCPUFeature_AVX;
    }
  }

  return detected;
}

#include <stdio.h>

static int haveCPUFeature(unsigned int feature) {
  static unsigned int sCPUFeatures = 0;
  static int sDetectedCPUFeatures = 0;
  if (!sDetectedCPUFeatures) {
    sDetectedCPUFeatures = 1;
    sCPUFeatures = checkCPUFeatures();
    if ((sCPUFeatures & kCPUFeature_AVX) != 0) {
      printf("torch running avx\n");
    } else {
      printf("torch running sse \n");
    }
  }
  return (sCPUFeatures & feature) != 0;
}

#endif

/*
 * Declarations of the two kernel variants that convolve_5x5() dispatches to.
 * Their definitions are not part of this file; the parameter meanings are
 * known here only by name.
 * NOTE(review): presumably the SSE and AVX implementations of the same 5x5
 * convolution - confirm against their definitions.
 */
void convolve_5x5_sse(float* output, float* input, float* kernel, long outRows, long outCols, long outStride, long inCols);
void convolve_5x5_avx(float* output, float* input, float* kernel, long outRows, long outCols, long outStride, long inCols);

/*
 * Dispatch a 5x5 convolution to the AVX or SSE kernel.
 *
 * When built with USE_AVX and __AVX__ defined, the AVX kernel is used if
 * haveCPUFeature(kCPUFeature_AVX) reports it at run time; in every other
 * case the SSE kernel runs. All arguments are forwarded unchanged, and
 * outCols is also passed as the kernels' outStride argument.
 */
void convolve_5x5(float* output, float* input, float* kernel, long outRows, long outCols, long inCols) {
  /* This entry point takes no separate stride: outCols is forwarded for it. */
  const long outStride = outCols;

#if defined(USE_AVX) && defined(__AVX__)
  if (haveCPUFeature(kCPUFeature_AVX)) {
    convolve_5x5_avx(output, input, kernel, outRows, outCols, outStride, inCols);
    return;
  }
#endif

  convolve_5x5_sse(output, input, kernel, outRows, outCols, outStride, inCols);
}