You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

avx.c 5.4KB


  1. /*-
  2. * Copyright 2016 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "config.h"
  17. #include "cryptobox.h"
  18. #include "curve25519.h"
  19. #include "platform_config.h"
  20. typedef struct {
  21. guint64 v[5];
  22. } fe51;
  23. typedef guint64 fe[10];
  24. extern void ladder_avx (fe *var, const guchar *p);
  25. extern void ladder_base_avx (fe *var, const guchar *p);
  26. extern void fe51_mul_avx (fe51 *a, const fe51 *b, const fe51 *c);
  27. extern void fe51_pack_avx (guchar *out, const fe51 *var);
  28. extern void fe51_nsquare_avx (fe51 *a, const fe51 *b, gint n);
  /* Read three bytes at `in` as a little-endian unsigned integer. */
  static guint64 load_3 (const unsigned char *in)
  {
      guint64 acc = 0;
      int idx;

      /* Walk from the most significant byte down to in[0]. */
      for (idx = 2; idx >= 0; idx--) {
          acc = (acc << 8) | (guint64) in[idx];
      }

      return acc;
  }
  /* Read four bytes at `in` as a little-endian unsigned integer. */
  static guint64 load_4 (const unsigned char *in)
  {
      guint64 acc = 0;
      int idx;

      /* Walk from the most significant byte down to in[0]. */
      for (idx = 3; idx >= 0; idx--) {
          acc = (acc << 8) | (guint64) in[idx];
      }

      return acc;
  }
  46. static void
  47. fe_frombytes (fe h, const unsigned char *s)
  48. {
  49. guint64 h0 = load_4 (s);
  50. guint64 h1 = load_3 (s + 4) << 6;
  51. guint64 h2 = load_3 (s + 7) << 5;
  52. guint64 h3 = load_3 (s + 10) << 3;
  53. guint64 h4 = load_3 (s + 13) << 2;
  54. guint64 h5 = load_4 (s + 16);
  55. guint64 h6 = load_3 (s + 20) << 7;
  56. guint64 h7 = load_3 (s + 23) << 5;
  57. guint64 h8 = load_3 (s + 26) << 4;
  58. guint64 h9 = load_3 (s + 29) << 2;
  59. guint64 carry0;
  60. guint64 carry1;
  61. guint64 carry2;
  62. guint64 carry3;
  63. guint64 carry4;
  64. guint64 carry5;
  65. guint64 carry6;
  66. guint64 carry7;
  67. guint64 carry8;
  68. guint64 carry9;
  69. carry9 = h9 >> 25;
  70. h0 += carry9 * 19;
  71. h9 &= 0x1FFFFFF;
  72. carry1 = h1 >> 25;
  73. h2 += carry1;
  74. h1 &= 0x1FFFFFF;
  75. carry3 = h3 >> 25;
  76. h4 += carry3;
  77. h3 &= 0x1FFFFFF;
  78. carry5 = h5 >> 25;
  79. h6 += carry5;
  80. h5 &= 0x1FFFFFF;
  81. carry7 = h7 >> 25;
  82. h8 += carry7;
  83. h7 &= 0x1FFFFFF;
  84. carry0 = h0 >> 26;
  85. h1 += carry0;
  86. h0 &= 0x3FFFFFF;
  87. carry2 = h2 >> 26;
  88. h3 += carry2;
  89. h2 &= 0x3FFFFFF;
  90. carry4 = h4 >> 26;
  91. h5 += carry4;
  92. h4 &= 0x3FFFFFF;
  93. carry6 = h6 >> 26;
  94. h7 += carry6;
  95. h6 &= 0x3FFFFFF;
  96. carry8 = h8 >> 26;
  97. h9 += carry8;
  98. h8 &= 0x3FFFFFF;
  99. h[0] = h0;
  100. h[1] = h1;
  101. h[2] = h2;
  102. h[3] = h3;
  103. h[4] = h4;
  104. h[5] = h5;
  105. h[6] = h6;
  106. h[7] = h7;
  107. h[8] = h8;
  108. h[9] = h9;
  109. }
  110. #define fe51_square(x, y) fe51_nsquare_avx(x, y, 1)
  111. void fe51_invert (fe51 *r, const fe51 *x)
  112. {
  113. fe51 z2;
  114. fe51 z9;
  115. fe51 z11;
  116. fe51 z2_5_0;
  117. fe51 z2_10_0;
  118. fe51 z2_20_0;
  119. fe51 z2_50_0;
  120. fe51 z2_100_0;
  121. fe51 t;
  122. /* 2 */ fe51_square (&z2, x);
  123. /* 4 */ fe51_square (&t, &z2);
  124. /* 8 */ fe51_square (&t, &t);
  125. /* 9 */ fe51_mul_avx (&z9, &t, x);
  126. /* 11 */ fe51_mul_avx (&z11, &z9, &z2);
  127. /* 22 */ fe51_square (&t, &z11);
  128. /* 2^5 - 2^0 = 31 */ fe51_mul_avx (&z2_5_0, &t, &z9);
  129. /* 2^10 - 2^5 */ fe51_nsquare_avx (&t, &z2_5_0, 5);
  130. /* 2^10 - 2^0 */ fe51_mul_avx (&z2_10_0, &t, &z2_5_0);
  131. /* 2^20 - 2^10 */ fe51_nsquare_avx (&t, &z2_10_0, 10);
  132. /* 2^20 - 2^0 */ fe51_mul_avx (&z2_20_0, &t, &z2_10_0);
  133. /* 2^40 - 2^20 */ fe51_nsquare_avx (&t, &z2_20_0, 20);
  134. /* 2^40 - 2^0 */ fe51_mul_avx (&t, &t, &z2_20_0);
  135. /* 2^50 - 2^10 */ fe51_nsquare_avx (&t, &t, 10);
  136. /* 2^50 - 2^0 */ fe51_mul_avx (&z2_50_0, &t, &z2_10_0);
  137. /* 2^100 - 2^50 */ fe51_nsquare_avx (&t, &z2_50_0, 50);
  138. /* 2^100 - 2^0 */ fe51_mul_avx (&z2_100_0, &t, &z2_50_0);
  139. /* 2^200 - 2^100 */ fe51_nsquare_avx (&t, &z2_100_0, 100);
  140. /* 2^200 - 2^0 */ fe51_mul_avx (&t, &t, &z2_100_0);
  141. /* 2^250 - 2^50 */ fe51_nsquare_avx (&t, &t, 50);
  142. /* 2^250 - 2^0 */ fe51_mul_avx (&t, &t, &z2_50_0);
  143. /* 2^255 - 2^5 */ fe51_nsquare_avx (&t, &t, 5);
  144. /* 2^255 - 21 */ fe51_mul_avx (r, &t, &z11);
  145. }
  146. #define x1 var[0]
  147. #define x2 var[1]
  148. #define z2 var[2]
  149. void
  150. scalarmult_avx (unsigned char *q,
  151. const unsigned char *n,
  152. const unsigned char *p)
  153. {
  154. fe var[3];
  155. fe51 x_51;
  156. fe51 z_51;
  157. unsigned char e[32];
  158. memcpy (e, n, 32);
  159. e[0] &= 248;
  160. e[31] &= 127;
  161. e[31] |= 64;
  162. fe_frombytes (x1, p);
  163. ladder_avx (var, e);
  164. z_51.v[0] = (z2[1] << 26) + z2[0];
  165. z_51.v[1] = (z2[3] << 26) + z2[2];
  166. z_51.v[2] = (z2[5] << 26) + z2[4];
  167. z_51.v[3] = (z2[7] << 26) + z2[6];
  168. z_51.v[4] = (z2[9] << 26) + z2[8];
  169. x_51.v[0] = (x2[1] << 26) + x2[0];
  170. x_51.v[1] = (x2[3] << 26) + x2[2];
  171. x_51.v[2] = (x2[5] << 26) + x2[4];
  172. x_51.v[3] = (x2[7] << 26) + x2[6];
  173. x_51.v[4] = (x2[9] << 26) + x2[8];
  174. fe51_invert (&z_51, &z_51);
  175. fe51_mul_avx (&x_51, &x_51, &z_51);
  176. fe51_pack_avx (q, &x_51);
  177. }
  178. #undef x2
  179. #undef z2
  180. #define x2 var[0]
  181. #define z2 var[1]
  182. int
  183. scalarmult_base_avx (unsigned char *q, const unsigned char *n)
  184. {
  185. unsigned char e[32];
  186. fe var[3];
  187. fe51 x_51;
  188. fe51 z_51;
  189. memcpy (e, n, 32);
  190. e[0] &= 248;
  191. e[31] &= 127;
  192. e[31] |= 64;
  193. ladder_base_avx (var, e);
  194. z_51.v[0] = (z2[1] << 26) + z2[0];
  195. z_51.v[1] = (z2[3] << 26) + z2[2];
  196. z_51.v[2] = (z2[5] << 26) + z2[4];
  197. z_51.v[3] = (z2[7] << 26) + z2[6];
  198. z_51.v[4] = (z2[9] << 26) + z2[8];
  199. x_51.v[0] = (x2[1] << 26) + x2[0];
  200. x_51.v[1] = (x2[3] << 26) + x2[2];
  201. x_51.v[2] = (x2[5] << 26) + x2[4];
  202. x_51.v[3] = (x2[7] << 26) + x2[6];
  203. x_51.v[4] = (x2[9] << 26) + x2[8];
  204. fe51_invert (&z_51, &z_51);
  205. fe51_mul_avx (&x_51, &x_51, &z_51);
  206. fe51_pack_avx (q, &x_51);
  207. return 0;
  208. }