]> source.dussan.org Git - tigervnc.git/commitdiff
64-bit SIMD acceleration
authorDRC <dcommander@users.sourceforge.net>
Thu, 25 Jun 2009 20:38:31 +0000 (20:38 +0000)
committerDRC <dcommander@users.sourceforge.net>
Thu, 25 Jun 2009 20:38:31 +0000 (20:38 +0000)
git-svn-id: svn://svn.code.sf.net/p/tigervnc/code/trunk@3858 3789f03b-4d11-0410-bbf8-ca57d06f2519

24 files changed:
common/jpeg/acinclude.m4
common/jpeg/configure.ac
common/jpeg/jsimd.c
common/jpeg/simd/Makefile.am
common/jpeg/simd/jcclrss2-64.asm [new file with mode: 0644]
common/jpeg/simd/jccolss2-64.asm [new file with mode: 0644]
common/jpeg/simd/jcqnts2f-64.asm [new file with mode: 0644]
common/jpeg/simd/jcqnts2i-64.asm [new file with mode: 0644]
common/jpeg/simd/jcsamss2-64.asm [new file with mode: 0644]
common/jpeg/simd/jdclrss2-64.asm [new file with mode: 0644]
common/jpeg/simd/jdcolss2-64.asm [new file with mode: 0644]
common/jpeg/simd/jdmerss2-64.asm [new file with mode: 0644]
common/jpeg/simd/jdmrgss2-64.asm [new file with mode: 0644]
common/jpeg/simd/jdsamss2-64.asm [new file with mode: 0644]
common/jpeg/simd/jfss2fst-64.asm [new file with mode: 0644]
common/jpeg/simd/jfss2int-64.asm [new file with mode: 0644]
common/jpeg/simd/jfsseflt-64.asm [new file with mode: 0644]
common/jpeg/simd/jiss2flt-64.asm [new file with mode: 0644]
common/jpeg/simd/jiss2fst-64.asm [new file with mode: 0644]
common/jpeg/simd/jiss2int-64.asm [new file with mode: 0644]
common/jpeg/simd/jiss2red-64.asm [new file with mode: 0644]
common/jpeg/simd/jsimdcpu-64.asm [new file with mode: 0644]
common/jpeg/simd/jsimdext.inc
common/jpeg/simd/nasm_lt.sh

index 9150799b0f4498bfdfbea80ca470f2298f2efafa..f7d7b690b4e351aae262fd2d768709bb102db64f 100644 (file)
@@ -24,7 +24,14 @@ case "$host_os" in
     objfmt='a.out'
   ;;
   linux*)
-    objfmt='ELF'
+    case "$host_cpu" in
+      x86_64)
+        objfmt='ELF64'
+        ;;
+      *)
+        objfmt='ELF'
+        ;;
+    esac
   ;;
   freebsd* | netbsd* | openbsd*)
     if echo __ELF__ | $CC -E - | grep __ELF__ > /dev/null; then
@@ -58,6 +65,7 @@ case "$objfmt" in
   a.out)      NAFLAGS='-faout -DAOUT';;
   BSD-a.out)  NAFLAGS='-faoutb -DAOUT';;
   ELF)        NAFLAGS='-felf -DELF';;
+  ELF64)      NAFLAGS='-felf64 -DELF -D__x86_64__';;
   RDF)        NAFLAGS='-frdf -DRDF';;
   Mach-O)     NAFLAGS='-fmacho -DMACHO';;
 esac
@@ -68,7 +76,6 @@ AC_MSG_CHECKING([whether the assembler ($NASM $NAFLAGS) works])
 cat > conftest.asm <<EOF
 [%line __oline__ "configure"
         section .text
-        bits    32
         global  _main,main
 _main:
 main:   xor     eax,eax
index 02454588f75d97f52677ef63bef44269aea863db..3bcdf43078027d48fcae0621f6dcbbbeb9b4ff2a 100644 (file)
@@ -89,10 +89,17 @@ if test "x${with_simd}" != "xno"; then
   # Check if we're on a supported CPU
   AC_MSG_CHECKING([if host cpu type is i386 or compatible])
   case "$host_cpu" in
+    x86_64)
+      AC_MSG_RESULT(yes)
+      AC_PROG_NASM
+      AC_DEFINE([WITH_SIMD], [1], [Use accelerated SIMD routines.])
+      AC_DEFINE([WITH_SIMD64], [1], [Use x86-64 accelerated SIMD routines.])
+    ;;
     i*86 | x86 | ia32)
       AC_MSG_RESULT(yes)
       AC_PROG_NASM
       AC_DEFINE([WITH_SIMD], [1], [Use accelerated SIMD routines.])
+      SIMDDIR=simd
     ;;
     *)
       AC_MSG_RESULT([no ("$host_cpu")])
@@ -100,6 +107,7 @@ if test "x${with_simd}" != "xno"; then
     ;;
   esac
 fi
+# Select the 64-bit SIMD sources only when SIMD is enabled and the host is x86-64.
+AM_CONDITIONAL([WITH_SIMD64], [test "x$with_simd" != "xno" && test "x$host_cpu" = "xx86_64"])
 AM_CONDITIONAL([WITH_SIMD], [test "x$with_simd" != "xno"])
 
 # jconfig.h is the file we use, but we have another before that to
index 861309ab71d5f8bfd8d4e5ae4d7da1ce99139dac..20e83fc59da788577b0769d413001a09d4e7f350 100644 (file)
@@ -49,10 +49,12 @@ init_simd (void)
 
 #ifdef WITH_SIMD
   simd_support = jpeg_simd_cpu_support();
+  #ifndef __x86_64__
   if((env=getenv("JSIMD_FORCEMMX"))!=NULL && !strcmp(env, "1"))
     simd_support = JSIMD_MMX;
   else if((env=getenv("JSIMD_FORCESSE2"))!=NULL && !strcmp(env, "1"))
     simd_support = JSIMD_SSE2;
+  #endif
 #else
   simd_support = JSIMD_NONE;
 #endif
@@ -109,45 +111,63 @@ jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
 {
 #ifdef WITH_SIMD
   void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+  #ifndef __x86_64__
   void (*mmxfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+  #endif
   switch(cinfo->in_color_space)
   {
     case JCS_EXT_RGB:
       sse2fct=jsimd_extrgb_ycc_convert_sse2;
+      #ifndef __x86_64__
       mmxfct=jsimd_extrgb_ycc_convert_mmx;
+      #endif
       break;
     case JCS_EXT_RGBX:
       sse2fct=jsimd_extrgbx_ycc_convert_sse2;
+      #ifndef __x86_64__
       mmxfct=jsimd_extrgbx_ycc_convert_mmx;
+      #endif
       break;
     case JCS_EXT_BGR:
       sse2fct=jsimd_extbgr_ycc_convert_sse2;
+      #ifndef __x86_64__
       mmxfct=jsimd_extbgr_ycc_convert_mmx;
+      #endif
       break;
     case JCS_EXT_BGRX:
       sse2fct=jsimd_extbgrx_ycc_convert_sse2;
+      #ifndef __x86_64__
       mmxfct=jsimd_extbgrx_ycc_convert_mmx;
+      #endif
       break;
     case JCS_EXT_XBGR:
       sse2fct=jsimd_extxbgr_ycc_convert_sse2;
+      #ifndef __x86_64__
       mmxfct=jsimd_extxbgr_ycc_convert_mmx;
+      #endif
       break;
     case JCS_EXT_XRGB:
       sse2fct=jsimd_extxrgb_ycc_convert_sse2;
+      #ifndef __x86_64__
       mmxfct=jsimd_extxrgb_ycc_convert_mmx;
+      #endif
       break;
     default:
       sse2fct=jsimd_rgb_ycc_convert_sse2;
+      #ifndef __x86_64__
       mmxfct=jsimd_rgb_ycc_convert_mmx;
+      #endif
       break;
   }
   if ((simd_support & JSIMD_SSE2) &&
       IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2))
     sse2fct(cinfo->image_width, input_buf,
         output_buf, output_row, num_rows);
+  #ifndef __x86_64__
   else if (simd_support & JSIMD_MMX)
     mmxfct(cinfo->image_width, input_buf,
         output_buf, output_row, num_rows);
+  #endif
 #endif
 }
 
@@ -158,45 +178,63 @@ jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
 {
 #ifdef WITH_SIMD
   void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+  #ifndef __x86_64__
   void (*mmxfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+  #endif
   switch(cinfo->out_color_space)
   {
     case JCS_EXT_RGB:
       sse2fct=jsimd_ycc_extrgb_convert_sse2;
+      #ifndef __x86_64__
       mmxfct=jsimd_ycc_extrgb_convert_mmx;
+      #endif
       break;
     case JCS_EXT_RGBX:
       sse2fct=jsimd_ycc_extrgbx_convert_sse2;
+      #ifndef __x86_64__
       mmxfct=jsimd_ycc_extrgbx_convert_mmx;
+      #endif
       break;
     case JCS_EXT_BGR:
       sse2fct=jsimd_ycc_extbgr_convert_sse2;
+      #ifndef __x86_64__
       mmxfct=jsimd_ycc_extbgr_convert_mmx;
+      #endif
       break;
     case JCS_EXT_BGRX:
       sse2fct=jsimd_ycc_extbgrx_convert_sse2;
+      #ifndef __x86_64__
       mmxfct=jsimd_ycc_extbgrx_convert_mmx;
+      #endif
       break;
     case JCS_EXT_XBGR:
       sse2fct=jsimd_ycc_extxbgr_convert_sse2;
+      #ifndef __x86_64__
       mmxfct=jsimd_ycc_extxbgr_convert_mmx;
+      #endif
       break;
     case JCS_EXT_XRGB:
       sse2fct=jsimd_ycc_extxrgb_convert_sse2;
+      #ifndef __x86_64__
       mmxfct=jsimd_ycc_extxrgb_convert_mmx;
+      #endif
       break;
     default:
       sse2fct=jsimd_ycc_rgb_convert_sse2;
+      #ifndef __x86_64__
       mmxfct=jsimd_ycc_rgb_convert_mmx;
+      #endif
       break;
   }
   if ((simd_support & JSIMD_SSE2) &&
       IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
     sse2fct(cinfo->output_width, input_buf,
         input_row, output_buf, num_rows);
+  #ifndef __x86_64__
   else if (simd_support & JSIMD_MMX)
     mmxfct(cinfo->output_width, input_buf,
         input_row, output_buf, num_rows);
+  #endif
 #endif
 }
 
@@ -213,8 +251,10 @@ jsimd_can_h2v2_downsample (void)
 
   if (simd_support & JSIMD_SSE2)
     return 1;
+  #ifndef __x86_64__
   if (simd_support & JSIMD_MMX)
     return 1;
+  #endif
 
   return 0;
 }
@@ -232,8 +272,10 @@ jsimd_can_h2v1_downsample (void)
 
   if (simd_support & JSIMD_SSE2)
     return 1;
+  #ifndef __x86_64__
   if (simd_support & JSIMD_MMX)
     return 1;
+  #endif
 
   return 0;
 }
@@ -247,10 +289,12 @@ jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
     jsimd_h2v2_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
         compptr->v_samp_factor, compptr->width_in_blocks,
         input_data, output_data);
+  #ifndef __x86_64__
   else if (simd_support & JSIMD_MMX)
     jsimd_h2v2_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor,
         compptr->v_samp_factor, compptr->width_in_blocks,
         input_data, output_data);
+  #endif
 #endif
 }
 
@@ -263,10 +307,12 @@ jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
     jsimd_h2v1_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
         compptr->v_samp_factor, compptr->width_in_blocks,
         input_data, output_data);
+  #ifndef __x86_64__
   else if (simd_support & JSIMD_MMX)
     jsimd_h2v1_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor,
         compptr->v_samp_factor, compptr->width_in_blocks,
         input_data, output_data);
+  #endif
 #endif
 }
 
@@ -283,8 +329,10 @@ jsimd_can_h2v2_upsample (void)
 
   if (simd_support & JSIMD_SSE2)
     return 1;
+  #ifndef __x86_64__
   if (simd_support & JSIMD_MMX)
     return 1;
+  #endif
 
   return 0;
 }
@@ -302,8 +350,10 @@ jsimd_can_h2v1_upsample (void)
 
   if (simd_support & JSIMD_SSE2)
     return 1;
+  #ifndef __x86_64__
   if (simd_support & JSIMD_MMX)
     return 1;
+  #endif
 
   return 0;
 }
@@ -318,9 +368,11 @@ jsimd_h2v2_upsample (j_decompress_ptr cinfo,
   if (simd_support & JSIMD_SSE2)
     jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor,
         cinfo->output_width, input_data, output_data_ptr);
+  #ifndef __x86_64__
   else if (simd_support & JSIMD_MMX)
     jsimd_h2v2_upsample_mmx(cinfo->max_v_samp_factor,
         cinfo->output_width, input_data, output_data_ptr);
+  #endif
 #endif
 }
 
@@ -334,9 +386,11 @@ jsimd_h2v1_upsample (j_decompress_ptr cinfo,
   if (simd_support & JSIMD_SSE2)
     jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor,
         cinfo->output_width, input_data, output_data_ptr);
+  #ifndef __x86_64__
   else if (simd_support & JSIMD_MMX)
     jsimd_h2v1_upsample_mmx(cinfo->max_v_samp_factor,
         cinfo->output_width, input_data, output_data_ptr);
+  #endif
 #endif
 }
 
@@ -354,8 +408,10 @@ jsimd_can_h2v2_fancy_upsample (void)
   if ((simd_support & JSIMD_SSE2) &&
       IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
     return 1;
+  #ifndef __x86_64__
   if (simd_support & JSIMD_MMX)
     return 1;
+  #endif
 
   return 0;
 }
@@ -374,8 +430,10 @@ jsimd_can_h2v1_fancy_upsample (void)
   if ((simd_support & JSIMD_SSE2) &&
       IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
     return 1;
+  #ifndef __x86_64__
   if (simd_support & JSIMD_MMX)
     return 1;
+  #endif
 
   return 0;
 }
@@ -391,9 +449,11 @@ jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
       IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
     jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor,
         compptr->downsampled_width, input_data, output_data_ptr);
+  #ifndef __x86_64__
   else if (simd_support & JSIMD_MMX)
     jsimd_h2v2_fancy_upsample_mmx(cinfo->max_v_samp_factor,
         compptr->downsampled_width, input_data, output_data_ptr);
+  #endif
 #endif
 }
 
@@ -408,9 +468,11 @@ jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
       IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
     jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor,
         compptr->downsampled_width, input_data, output_data_ptr);
+  #ifndef __x86_64__
   else if (simd_support & JSIMD_MMX)
     jsimd_h2v1_fancy_upsample_mmx(cinfo->max_v_samp_factor,
         compptr->downsampled_width, input_data, output_data_ptr);
+  #endif
 #endif
 }
 
@@ -428,8 +490,10 @@ jsimd_can_h2v2_merged_upsample (void)
   if ((simd_support & JSIMD_SSE2) &&
       IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
     return 1;
+  #ifndef __x86_64__
   if (simd_support & JSIMD_MMX)
     return 1;
+  #endif
 
   return 0;
 }
@@ -448,8 +512,10 @@ jsimd_can_h2v1_merged_upsample (void)
   if ((simd_support & JSIMD_SSE2) &&
       IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
     return 1;
+  #ifndef __x86_64__
   if (simd_support & JSIMD_MMX)
     return 1;
+  #endif
 
   return 0;
 }
@@ -462,45 +528,63 @@ jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
 {
 #ifdef WITH_SIMD
   void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  #ifndef __x86_64__
   void (*mmxfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  #endif
   switch(cinfo->out_color_space)
   {
     case JCS_EXT_RGB:
       sse2fct=jsimd_h2v2_extrgb_merged_upsample_sse2;
+      #ifndef __x86_64__
       mmxfct=jsimd_h2v2_extrgb_merged_upsample_mmx;
+      #endif
       break;
     case JCS_EXT_RGBX:
       sse2fct=jsimd_h2v2_extrgbx_merged_upsample_sse2;
+      #ifndef __x86_64__
       mmxfct=jsimd_h2v2_extrgbx_merged_upsample_mmx;
+      #endif
       break;
     case JCS_EXT_BGR:
       sse2fct=jsimd_h2v2_extbgr_merged_upsample_sse2;
+      #ifndef __x86_64__
       mmxfct=jsimd_h2v2_extbgr_merged_upsample_mmx;
+      #endif
       break;
     case JCS_EXT_BGRX:
       sse2fct=jsimd_h2v2_extbgrx_merged_upsample_sse2;
+      #ifndef __x86_64__
       mmxfct=jsimd_h2v2_extbgrx_merged_upsample_mmx;
+      #endif
       break;
     case JCS_EXT_XBGR:
       sse2fct=jsimd_h2v2_extxbgr_merged_upsample_sse2;
+      #ifndef __x86_64__
       mmxfct=jsimd_h2v2_extxbgr_merged_upsample_mmx;
+      #endif
       break;
     case JCS_EXT_XRGB:
       sse2fct=jsimd_h2v2_extxrgb_merged_upsample_sse2;
+      #ifndef __x86_64__
       mmxfct=jsimd_h2v2_extxrgb_merged_upsample_mmx;
+      #endif
       break;
     default:
       sse2fct=jsimd_h2v2_merged_upsample_sse2;
+      #ifndef __x86_64__
       mmxfct=jsimd_h2v2_merged_upsample_mmx;
+      #endif
       break;
   }
   if ((simd_support & JSIMD_SSE2) &&
       IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
     sse2fct(cinfo->output_width, input_buf,
         in_row_group_ctr, output_buf);
+  #ifndef __x86_64__
   else if (simd_support & JSIMD_MMX)
     mmxfct(cinfo->output_width, input_buf,
         in_row_group_ctr, output_buf);
+  #endif
 #endif
 }
 
@@ -512,45 +596,63 @@ jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
 {
 #ifdef WITH_SIMD
   void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  #ifndef __x86_64__
   void (*mmxfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  #endif
   switch(cinfo->out_color_space)
   {
     case JCS_EXT_RGB:
       sse2fct=jsimd_h2v1_extrgb_merged_upsample_sse2;
+      #ifndef __x86_64__
       mmxfct=jsimd_h2v1_extrgb_merged_upsample_mmx;
+      #endif
       break;
     case JCS_EXT_RGBX:
       sse2fct=jsimd_h2v1_extrgbx_merged_upsample_sse2;
+      #ifndef __x86_64__
       mmxfct=jsimd_h2v1_extrgbx_merged_upsample_mmx;
+      #endif
       break;
     case JCS_EXT_BGR:
       sse2fct=jsimd_h2v1_extbgr_merged_upsample_sse2;
+      #ifndef __x86_64__
       mmxfct=jsimd_h2v1_extbgr_merged_upsample_mmx;
+      #endif
       break;
     case JCS_EXT_BGRX:
       sse2fct=jsimd_h2v1_extbgrx_merged_upsample_sse2;
+      #ifndef __x86_64__
       mmxfct=jsimd_h2v1_extbgrx_merged_upsample_mmx;
+      #endif
       break;
     case JCS_EXT_XBGR:
       sse2fct=jsimd_h2v1_extxbgr_merged_upsample_sse2;
+      #ifndef __x86_64__
       mmxfct=jsimd_h2v1_extxbgr_merged_upsample_mmx;
+      #endif
       break;
     case JCS_EXT_XRGB:
       sse2fct=jsimd_h2v1_extxrgb_merged_upsample_sse2;
+      #ifndef __x86_64__
       mmxfct=jsimd_h2v1_extxrgb_merged_upsample_mmx;
+      #endif
       break;
     default:
       sse2fct=jsimd_h2v1_merged_upsample_sse2;
+      #ifndef __x86_64__
       mmxfct=jsimd_h2v1_merged_upsample_mmx;
+      #endif
       break;
   }
   if ((simd_support & JSIMD_SSE2) &&
       IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
     sse2fct(cinfo->output_width, input_buf,
         in_row_group_ctr, output_buf);
+  #ifndef __x86_64__
   else if (simd_support & JSIMD_MMX)
     mmxfct(cinfo->output_width, input_buf,
         in_row_group_ctr, output_buf);
+  #endif
 #endif
 }
 
@@ -571,8 +673,10 @@ jsimd_can_convsamp (void)
 
   if (simd_support & JSIMD_SSE2)
     return 1;
+  #ifndef __x86_64__
   if (simd_support & JSIMD_MMX)
     return 1;
+  #endif
 
   return 0;
 }
@@ -596,8 +700,10 @@ jsimd_can_convsamp_float (void)
     return 1;
   if (simd_support & JSIMD_SSE)
     return 1;
+  #ifndef __x86_64__
   if (simd_support & JSIMD_3DNOW)
     return 1;
+  #endif
 
   return 0;
 }
@@ -609,8 +715,10 @@ jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
 #ifdef WITH_SIMD
   if (simd_support & JSIMD_SSE2)
     jsimd_convsamp_sse2(sample_data, start_col, workspace);
+  #ifndef __x86_64__
   else if (simd_support & JSIMD_MMX)
     jsimd_convsamp_mmx(sample_data, start_col, workspace);
+  #endif
 #endif
 }
 
@@ -621,10 +729,12 @@ jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
 #ifdef WITH_SIMD
   if (simd_support & JSIMD_SSE2)
     jsimd_convsamp_float_sse2(sample_data, start_col, workspace);
+  #ifndef __x86_64__
   else if (simd_support & JSIMD_SSE)
     jsimd_convsamp_float_sse(sample_data, start_col, workspace);
   else if (simd_support & JSIMD_3DNOW)
     jsimd_convsamp_float_3dnow(sample_data, start_col, workspace);
+  #endif
 #endif
 }
 
@@ -641,8 +751,10 @@ jsimd_can_fdct_islow (void)
 
   if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
     return 1;
+  #ifndef __x86_64__
   if (simd_support & JSIMD_MMX)
     return 1;
+  #endif
 
   return 0;
 }
@@ -660,8 +772,10 @@ jsimd_can_fdct_ifast (void)
 
   if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
     return 1;
+  #ifndef __x86_64__
   if (simd_support & JSIMD_MMX)
     return 1;
+  #endif
 
   return 0;
 }
@@ -679,8 +793,10 @@ jsimd_can_fdct_float (void)
 
   if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
     return 1;
+  #ifndef __x86_64__
   if (simd_support & JSIMD_3DNOW)
     return 1;
+  #endif
 
   return 0;
 }
@@ -691,8 +807,10 @@ jsimd_fdct_islow (DCTELEM * data)
 #ifdef WITH_SIMD
   if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
     jsimd_fdct_islow_sse2(data);
+  #ifndef __x86_64__
   else if (simd_support & JSIMD_MMX)
     jsimd_fdct_islow_mmx(data);
+  #endif
 #endif
 }
 
@@ -702,8 +820,10 @@ jsimd_fdct_ifast (DCTELEM * data)
 #ifdef WITH_SIMD
   if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
     jsimd_fdct_ifast_sse2(data);
+  #ifndef __x86_64__
   else if (simd_support & JSIMD_MMX)
     jsimd_fdct_ifast_mmx(data);
+  #endif
 #endif
 }
 
@@ -713,8 +833,10 @@ jsimd_fdct_float (FAST_FLOAT * data)
 #ifdef WITH_SIMD
   if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
     jsimd_fdct_float_sse(data);
+  #ifndef __x86_64__
   else if (simd_support & JSIMD_3DNOW)
     jsimd_fdct_float_3dnow(data);
+  #endif
 #endif
 }
 
@@ -733,8 +855,10 @@ jsimd_can_quantize (void)
 
   if (simd_support & JSIMD_SSE2)
     return 1;
+  #ifndef __x86_64__
   if (simd_support & JSIMD_MMX)
     return 1;
+  #endif
 
   return 0;
 }
@@ -756,8 +880,10 @@ jsimd_can_quantize_float (void)
     return 1;
   if (simd_support & JSIMD_SSE)
     return 1;
+  #ifndef __x86_64__
   if (simd_support & JSIMD_3DNOW)
     return 1;
+  #endif
 
   return 0;
 }
@@ -769,8 +895,10 @@ jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
 #ifdef WITH_SIMD
   if (simd_support & JSIMD_SSE2)
     jsimd_quantize_sse2(coef_block, divisors, workspace);
+  #ifndef __x86_64__
   else if (simd_support & JSIMD_MMX)
     jsimd_quantize_mmx(coef_block, divisors, workspace);
+  #endif
 #endif
 }
 
@@ -781,10 +909,12 @@ jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
 #ifdef WITH_SIMD
   if (simd_support & JSIMD_SSE2)
     jsimd_quantize_float_sse2(coef_block, divisors, workspace);
+  #ifndef __x86_64__
   else if (simd_support & JSIMD_SSE)
     jsimd_quantize_float_sse(coef_block, divisors, workspace);
   else if (simd_support & JSIMD_3DNOW)
     jsimd_quantize_float_3dnow(coef_block, divisors, workspace);
+  #endif
 #endif
 }
 
@@ -807,8 +937,10 @@ jsimd_can_idct_2x2 (void)
 
   if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
     return 1;
+  #ifndef __x86_64__
   if (simd_support & JSIMD_MMX)
     return 1;
+  #endif
 
   return 0;
 }
@@ -832,8 +964,10 @@ jsimd_can_idct_4x4 (void)
 
   if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
     return 1;
+  #ifndef __x86_64__
   if (simd_support & JSIMD_MMX)
     return 1;
+  #endif
 
   return 0;
 }
@@ -846,8 +980,10 @@ jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 #if WITH_SIMD
   if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
     jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf, output_col);
+  #ifndef __x86_64__
   else if (simd_support & JSIMD_MMX)
     jsimd_idct_2x2_mmx(compptr->dct_table, coef_block, output_buf, output_col);
+  #endif
 #endif
 }
 
@@ -859,8 +995,10 @@ jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 #if WITH_SIMD
   if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
     jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf, output_col);
+  #ifndef __x86_64__
   else if (simd_support & JSIMD_MMX)
     jsimd_idct_4x4_mmx(compptr->dct_table, coef_block, output_buf, output_col);
+  #endif
 #endif
 }
 
@@ -883,8 +1021,10 @@ jsimd_can_idct_islow (void)
 
   if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2))
     return 1;
+  #ifndef __x86_64__
   if (simd_support & JSIMD_MMX)
     return 1;
+  #endif
 
   return 0;
 }
@@ -910,8 +1050,10 @@ jsimd_can_idct_ifast (void)
 
   if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
     return 1;
+  #ifndef __x86_64__
   if (simd_support & JSIMD_MMX)
     return 1;
+  #endif
 
   return 0;
 }
@@ -936,10 +1078,12 @@ jsimd_can_idct_float (void)
 
   if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
     return 1;
+  #ifndef __x86_64__
   if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse))
     return 1;
   if (simd_support & JSIMD_3DNOW)
     return 1;
+  #endif
 
   return 0;
 }
@@ -952,8 +1096,10 @@ jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 #if WITH_SIMD
   if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2))
     jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf, output_col);
+  #ifndef __x86_64__
   else if (simd_support & JSIMD_MMX)
     jsimd_idct_islow_mmx(compptr->dct_table, coef_block, output_buf, output_col);
+  #endif
 #endif
 }
 
@@ -965,8 +1111,10 @@ jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 #if WITH_SIMD
   if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
     jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf, output_col);
+  #ifndef __x86_64__
   else if (simd_support & JSIMD_MMX)
     jsimd_idct_ifast_mmx(compptr->dct_table, coef_block, output_buf, output_col);
+  #endif
 #endif
 }
 
@@ -979,12 +1127,14 @@ jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
     jsimd_idct_float_sse2(compptr->dct_table, coef_block,
         output_buf, output_col);
+  #ifndef __x86_64__
   else if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse))
     jsimd_idct_float_sse(compptr->dct_table, coef_block,
         output_buf, output_col);
   else if (simd_support & JSIMD_3DNOW)
     jsimd_idct_float_3dnow(compptr->dct_table, coef_block,
         output_buf, output_col);
+  #endif
 #endif
 }
 
index 78516f0165042d4ae000d3a7be5580a0bd9fb240..da7d6842dfe12c2d6770c88a86f7442c60c8e0f5 100644 (file)
@@ -4,6 +4,23 @@ BUILT_SOURCES = jsimdcfg.inc
 
 EXTRA_DIST = nasm_lt.sh
 
+if WITH_SIMD64
+
+libsimd_la_SOURCES = jsimd.h jsimdcfg.inc.h \
+       jsimdext.inc jcolsamp.inc jdct.inc \
+       jsimdcpu-64.asm jfsseflt-64.asm \
+       jccolss2-64.asm jdcolss2-64.asm \
+       jcsamss2-64.asm jdsamss2-64.asm jdmerss2-64.asm \
+       jcqnts2i-64.asm jfss2fst-64.asm jfss2int-64.asm \
+       jiss2red-64.asm jiss2int-64.asm jiss2fst-64.asm \
+       jcqnts2f-64.asm jiss2flt-64.asm
+
+jccolss2-64.lo: jcclrss2-64.asm
+jdcolss2-64.lo: jdclrss2-64.asm
+jdmerss2-64.lo: jdmrgss2-64.asm
+
+else
+
 libsimd_la_SOURCES = jsimd.h jsimdcfg.inc.h \
        jsimdext.inc jcolsamp.inc jdct.inc \
        jsimdcpu.asm \
@@ -26,6 +43,8 @@ jdcolss2.lo: jdclrss2.asm
 jdmermmx.lo: jdmrgmmx.asm
 jdmerss2.lo: jdmrgss2.asm
 
+endif
+
 .asm.lo:
        $(LIBTOOL) --mode=compile --tag NASM ./nasm_lt.sh $(NASM) $(NAFLAGS) $< -o $@
 
diff --git a/common/jpeg/simd/jcclrss2-64.asm b/common/jpeg/simd/jcclrss2-64.asm
new file mode 100644 (file)
index 0000000..9900edd
--- /dev/null
@@ -0,0 +1,484 @@
+;
+; jcclrss2-64.asm - colorspace conversion (64-bit SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; Copyright (C) 2009, D. R. Commander.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+       SECTION SEG_TEXT
+       BITS    64
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width,
+;                             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+;                             JDIMENSION output_row, int num_rows);
+;
+
+; r10 = JDIMENSION img_width
+; r11 = JSAMPARRAY input_buf
+; r12 = JSAMPIMAGE output_buf
+; r13 = JDIMENSION output_row
+; r14 = int num_rows
+
+%define wk(i)          rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM         8
+
+       align   16
+
+       global  EXTN(jsimd_rgb_ycc_convert_sse2)
+
+EXTN(jsimd_rgb_ycc_convert_sse2):
+       push    rbp
+       mov     rax,rsp                         ; rax = original rbp
+       sub     rsp, byte 4
+       and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
+       mov     [rsp],rax
+       mov     rbp,rsp                         ; rbp = aligned rbp
+       lea     rsp, [wk(0)]
+       push    rbx
+       collect_args
+
+       mov     rcx, r10
+       test    rcx,rcx
+       jz      near .return
+
+       push    rcx
+
+       mov rsi, r12
+       mov rcx, r13
+       mov     rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+       mov     rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
+       mov     rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
+       lea     rdi, [rdi+rcx*SIZEOF_JSAMPROW]
+       lea     rbx, [rbx+rcx*SIZEOF_JSAMPROW]
+       lea     rdx, [rdx+rcx*SIZEOF_JSAMPROW]
+
+       pop     rcx
+
+       mov rsi, r11
+       mov     rax, r14
+       test    rax,rax
+       jle     near .return
+.rowloop:
+       push    rdx
+       push    rbx
+       push    rdi
+       push    rsi
+       push    rcx                     ; col
+
+       mov     rsi, JSAMPROW [rsi]     ; inptr
+       mov     rdi, JSAMPROW [rdi]     ; outptr0
+       mov     rbx, JSAMPROW [rbx]     ; outptr1
+       mov     rdx, JSAMPROW [rdx]     ; outptr2
+
+       cmp     rcx, byte SIZEOF_XMMWORD
+       jae     near .columnloop
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+.column_ld1:
+       push    rax
+       push    rdx
+       lea     rcx,[rcx+rcx*2]         ; imul ecx,RGB_PIXELSIZE
+       test    cl, SIZEOF_BYTE
+       jz      short .column_ld2
+       sub     rcx, byte SIZEOF_BYTE
+       movzx   rax, BYTE [rsi+rcx]
+.column_ld2:
+       test    cl, SIZEOF_WORD
+       jz      short .column_ld4
+       sub     rcx, byte SIZEOF_WORD
+       movzx   rdx, WORD [rsi+rcx]
+       shl     rax, WORD_BIT
+       or      rax,rdx
+.column_ld4:
+       movd    xmmA,eax
+       pop     rdx
+       pop     rax
+       test    cl, SIZEOF_DWORD
+       jz      short .column_ld8
+       sub     rcx, byte SIZEOF_DWORD
+       movd    xmmF, XMM_DWORD [rsi+rcx]
+       pslldq  xmmA, SIZEOF_DWORD
+       por     xmmA,xmmF
+.column_ld8:
+       test    cl, SIZEOF_MMWORD
+       jz      short .column_ld16
+       sub     rcx, byte SIZEOF_MMWORD
+       movq    xmmB, XMM_MMWORD [rsi+rcx]
+       pslldq  xmmA, SIZEOF_MMWORD
+       por     xmmA,xmmB
+.column_ld16:
+       test    cl, SIZEOF_XMMWORD
+       jz      short .column_ld32
+       movdqa  xmmF,xmmA
+       movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+       mov     rcx, SIZEOF_XMMWORD
+       jmp     short .rgb_ycc_cnv
+.column_ld32:
+       test    cl, 2*SIZEOF_XMMWORD
+       mov     rcx, SIZEOF_XMMWORD
+       jz      short .rgb_ycc_cnv
+       movdqa  xmmB,xmmA
+       movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+       movdqu  xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+       jmp     short .rgb_ycc_cnv
+
+.columnloop:
+       movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+       movdqu  xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+       movdqu  xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+       ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+       ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+       ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+       movdqa    xmmG,xmmA
+       pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+       psrldq    xmmG,8        ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+       punpckhbw xmmA,xmmF     ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+       pslldq    xmmF,8        ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+       punpcklbw xmmG,xmmB     ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+       punpckhbw xmmF,xmmB     ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+       movdqa    xmmD,xmmA
+       pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+       psrldq    xmmD,8        ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+       punpckhbw xmmA,xmmG     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+       pslldq    xmmG,8        ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+       punpcklbw xmmD,xmmF     ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+       punpckhbw xmmG,xmmF     ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+       movdqa    xmmE,xmmA
+       pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+       psrldq    xmmE,8        ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+       punpckhbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+       pslldq    xmmD,8        ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+       punpcklbw xmmE,xmmG     ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+       punpckhbw xmmD,xmmG     ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+       pxor      xmmH,xmmH
+
+       movdqa    xmmC,xmmA
+       punpcklbw xmmA,xmmH     ; xmmA=(00 02 04 06 08 0A 0C 0E)
+       punpckhbw xmmC,xmmH     ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+       movdqa    xmmB,xmmE
+       punpcklbw xmmE,xmmH     ; xmmE=(20 22 24 26 28 2A 2C 2E)
+       punpckhbw xmmB,xmmH     ; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+       movdqa    xmmF,xmmD
+       punpcklbw xmmD,xmmH     ; xmmD=(11 13 15 17 19 1B 1D 1F)
+       punpckhbw xmmF,xmmH     ; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+       test    cl, SIZEOF_XMMWORD/16
+       jz      short .column_ld2
+       sub     rcx, byte SIZEOF_XMMWORD/16
+       movd    xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld2:
+       test    cl, SIZEOF_XMMWORD/8
+       jz      short .column_ld4
+       sub     rcx, byte SIZEOF_XMMWORD/8
+       movq    xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
+       pslldq  xmmA, SIZEOF_MMWORD
+       por     xmmA,xmmE
+.column_ld4:
+       test    cl, SIZEOF_XMMWORD/4
+       jz      short .column_ld8
+       sub     rcx, byte SIZEOF_XMMWORD/4
+       movdqa  xmmE,xmmA
+       movdqu  xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld8:
+       test    cl, SIZEOF_XMMWORD/2
+       mov     rcx, SIZEOF_XMMWORD
+       jz      short .rgb_ycc_cnv
+       movdqa  xmmF,xmmA
+       movdqa  xmmH,xmmE
+       movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+       movdqu  xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+       jmp     short .rgb_ycc_cnv
+
+.columnloop:
+       movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+       movdqu  xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+       movdqu  xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+       movdqu  xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+       ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+       ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+       ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+       ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+       movdqa    xmmD,xmmA
+       punpcklbw xmmA,xmmE     ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+       punpckhbw xmmD,xmmE     ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+       movdqa    xmmC,xmmF
+       punpcklbw xmmF,xmmH     ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+       punpckhbw xmmC,xmmH     ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+       movdqa    xmmB,xmmA
+       punpcklwd xmmA,xmmF     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+       punpckhwd xmmB,xmmF     ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+       movdqa    xmmG,xmmD
+       punpcklwd xmmD,xmmC     ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+       punpckhwd xmmG,xmmC     ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+       movdqa    xmmE,xmmA
+       punpcklbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+       punpckhbw xmmE,xmmD     ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+       movdqa    xmmH,xmmB
+       punpcklbw xmmB,xmmG     ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+       punpckhbw xmmH,xmmG     ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+       pxor      xmmF,xmmF
+
+       movdqa    xmmC,xmmA
+       punpcklbw xmmA,xmmF     ; xmmA=(00 02 04 06 08 0A 0C 0E)
+       punpckhbw xmmC,xmmF     ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+       movdqa    xmmD,xmmB
+       punpcklbw xmmB,xmmF     ; xmmB=(01 03 05 07 09 0B 0D 0F)
+       punpckhbw xmmD,xmmF     ; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+       movdqa    xmmG,xmmE
+       punpcklbw xmmE,xmmF     ; xmmE=(20 22 24 26 28 2A 2C 2E)
+       punpckhbw xmmG,xmmF     ; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+       punpcklbw xmmF,xmmH
+       punpckhbw xmmH,xmmH
+       psrlw     xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
+       psrlw     xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+       ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+       ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+       ; (Original)
+       ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+       ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+       ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+       ;
+       ; (This implementation)
+       ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+       ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+       ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+       movdqa    XMMWORD [wk(0)], xmm0 ; wk(0)=RE
+       movdqa    XMMWORD [wk(1)], xmm1 ; wk(1)=RO
+       movdqa    XMMWORD [wk(2)], xmm4 ; wk(2)=BE
+       movdqa    XMMWORD [wk(3)], xmm5 ; wk(3)=BO
+
+       movdqa    xmm6,xmm1
+       punpcklwd xmm1,xmm3
+       punpckhwd xmm6,xmm3
+       movdqa    xmm7,xmm1
+       movdqa    xmm4,xmm6
+       pmaddwd   xmm1,[PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+       pmaddwd   xmm6,[PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+       pmaddwd   xmm7,[PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+       pmaddwd   xmm4,[PW_MF016_MF033] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+       movdqa    XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+       movdqa    XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+       pxor      xmm1,xmm1
+       pxor      xmm6,xmm6
+       punpcklwd xmm1,xmm5             ; xmm1=BOL
+       punpckhwd xmm6,xmm5             ; xmm6=BOH
+       psrld     xmm1,1                ; xmm1=BOL*FIX(0.500)
+       psrld     xmm6,1                ; xmm6=BOH*FIX(0.500)
+
+       movdqa    xmm5,[PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ]
+
+       paddd     xmm7,xmm1
+       paddd     xmm4,xmm6
+       paddd     xmm7,xmm5
+       paddd     xmm4,xmm5
+       psrld     xmm7,SCALEBITS        ; xmm7=CbOL
+       psrld     xmm4,SCALEBITS        ; xmm4=CbOH
+       packssdw  xmm7,xmm4             ; xmm7=CbO
+
+       movdqa    xmm1, XMMWORD [wk(2)] ; xmm1=BE
+
+       movdqa    xmm6,xmm0
+       punpcklwd xmm0,xmm2
+       punpckhwd xmm6,xmm2
+       movdqa    xmm5,xmm0
+       movdqa    xmm4,xmm6
+       pmaddwd   xmm0,[PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+       pmaddwd   xmm6,[PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+       pmaddwd   xmm5,[PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+       pmaddwd   xmm4,[PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+       movdqa    XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+       movdqa    XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+       pxor      xmm0,xmm0
+       pxor      xmm6,xmm6
+       punpcklwd xmm0,xmm1             ; xmm0=BEL
+       punpckhwd xmm6,xmm1             ; xmm6=BEH
+       psrld     xmm0,1                ; xmm0=BEL*FIX(0.500)
+       psrld     xmm6,1                ; xmm6=BEH*FIX(0.500)
+
+       movdqa    xmm1,[PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
+
+       paddd     xmm5,xmm0
+       paddd     xmm4,xmm6
+       paddd     xmm5,xmm1
+       paddd     xmm4,xmm1
+       psrld     xmm5,SCALEBITS        ; xmm5=CbEL
+       psrld     xmm4,SCALEBITS        ; xmm4=CbEH
+       packssdw  xmm5,xmm4             ; xmm5=CbE
+
+       psllw     xmm7,BYTE_BIT
+       por       xmm5,xmm7             ; xmm5=Cb
+       movdqa    XMMWORD [ebx], xmm5   ; Save Cb
+
+       movdqa    xmm0, XMMWORD [wk(3)] ; xmm0=BO
+       movdqa    xmm6, XMMWORD [wk(2)] ; xmm6=BE
+       movdqa    xmm1, XMMWORD [wk(1)] ; xmm1=RO
+
+       movdqa    xmm4,xmm0
+       punpcklwd xmm0,xmm3
+       punpckhwd xmm4,xmm3
+       movdqa    xmm7,xmm0
+       movdqa    xmm5,xmm4
+       pmaddwd   xmm0,[PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+       pmaddwd   xmm4,[PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+       pmaddwd   xmm7,[PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+       pmaddwd   xmm5,[PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+       movdqa    xmm3,[PD_ONEHALF]     ; xmm3=[PD_ONEHALF]
+
+       paddd     xmm0, XMMWORD [wk(4)]
+       paddd     xmm4, XMMWORD [wk(5)]
+       paddd     xmm0,xmm3
+       paddd     xmm4,xmm3
+       psrld     xmm0,SCALEBITS        ; xmm0=YOL
+       psrld     xmm4,SCALEBITS        ; xmm4=YOH
+       packssdw  xmm0,xmm4             ; xmm0=YO
+
+       pxor      xmm3,xmm3
+       pxor      xmm4,xmm4
+       punpcklwd xmm3,xmm1             ; xmm3=ROL
+       punpckhwd xmm4,xmm1             ; xmm4=ROH
+       psrld     xmm3,1                ; xmm3=ROL*FIX(0.500)
+       psrld     xmm4,1                ; xmm4=ROH*FIX(0.500)
+
+       movdqa    xmm1,[PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
+
+       paddd     xmm7,xmm3
+       paddd     xmm5,xmm4
+       paddd     xmm7,xmm1
+       paddd     xmm5,xmm1
+       psrld     xmm7,SCALEBITS        ; xmm7=CrOL
+       psrld     xmm5,SCALEBITS        ; xmm5=CrOH
+       packssdw  xmm7,xmm5             ; xmm7=CrO
+
+       movdqa    xmm3, XMMWORD [wk(0)] ; xmm3=RE
+
+       movdqa    xmm4,xmm6
+       punpcklwd xmm6,xmm2
+       punpckhwd xmm4,xmm2
+       movdqa    xmm1,xmm6
+       movdqa    xmm5,xmm4
+       pmaddwd   xmm6,[PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+       pmaddwd   xmm4,[PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+       pmaddwd   xmm1,[PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+       pmaddwd   xmm5,[PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+       movdqa    xmm2,[PD_ONEHALF]     ; xmm2=[PD_ONEHALF]
+
+       paddd     xmm6, XMMWORD [wk(6)]
+       paddd     xmm4, XMMWORD [wk(7)]
+       paddd     xmm6,xmm2
+       paddd     xmm4,xmm2
+       psrld     xmm6,SCALEBITS        ; xmm6=YEL
+       psrld     xmm4,SCALEBITS        ; xmm4=YEH
+       packssdw  xmm6,xmm4             ; xmm6=YE
+
+       psllw     xmm0,BYTE_BIT
+       por       xmm6,xmm0             ; xmm6=Y
+       movdqa    XMMWORD [rdi], xmm6   ; Save Y
+
+       pxor      xmm2,xmm2
+       pxor      xmm4,xmm4
+       punpcklwd xmm2,xmm3             ; xmm2=REL
+       punpckhwd xmm4,xmm3             ; xmm4=REH
+       psrld     xmm2,1                ; xmm2=REL*FIX(0.500)
+       psrld     xmm4,1                ; xmm4=REH*FIX(0.500)
+
+       movdqa    xmm0,[PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ]
+
+       paddd     xmm1,xmm2
+       paddd     xmm5,xmm4
+       paddd     xmm1,xmm0
+       paddd     xmm5,xmm0
+       psrld     xmm1,SCALEBITS        ; xmm1=CrEL
+       psrld     xmm5,SCALEBITS        ; xmm5=CrEH
+       packssdw  xmm1,xmm5             ; xmm1=CrE
+
+       psllw     xmm7,BYTE_BIT
+       por       xmm1,xmm7             ; xmm1=Cr
+       movdqa    XMMWORD [rdx], xmm1   ; Save Cr
+
+       sub     rcx, byte SIZEOF_XMMWORD
+       add     rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
+       add     rdi, byte SIZEOF_XMMWORD                ; outptr0
+       add     rbx, byte SIZEOF_XMMWORD                ; outptr1
+       add     rdx, byte SIZEOF_XMMWORD                ; outptr2
+       cmp     rcx, byte SIZEOF_XMMWORD
+       jae     near .columnloop
+       test    rcx,rcx
+       jnz     near .column_ld1
+
+       pop     rcx                     ; col
+       pop     rsi
+       pop     rdi
+       pop     rbx
+       pop     rdx
+
+       add     rsi, byte SIZEOF_JSAMPROW       ; input_buf
+       add     rdi, byte SIZEOF_JSAMPROW
+       add     rbx, byte SIZEOF_JSAMPROW
+       add     rdx, byte SIZEOF_JSAMPROW
+       dec     rax                             ; num_rows
+       jg      near .rowloop
+
+.return:
+       uncollect_args
+       pop     rbx
+       mov     rsp,rbp         ; rsp <- aligned rbp
+       pop     rsp             ; rsp <- original rbp
+       pop     rbp
+       ret
+
diff --git a/common/jpeg/simd/jccolss2-64.asm b/common/jpeg/simd/jccolss2-64.asm
new file mode 100644 (file)
index 0000000..a419d1b
--- /dev/null
@@ -0,0 +1,117 @@
+;
+; jccolss2.asm - colorspace conversion (64-bit SSE2)
+;
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; Copyright (C) 2009, D. R. Commander.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+; Fixed-point scale: FIX(x) below is x * 2^SCALEBITS rounded to the nearest
+; integer (e.g. 0.29900 * 65536 = 19595.26 -> 19595).
+%define SCALEBITS      16
+
+F_0_081        equ      5329                   ; FIX(0.08131)
+F_0_114        equ      7471                   ; FIX(0.11400)
+F_0_168        equ     11059                   ; FIX(0.16874)
+F_0_250        equ     16384                   ; FIX(0.25000)
+F_0_299        equ     19595                   ; FIX(0.29900)
+F_0_331        equ     21709                   ; FIX(0.33126)
+F_0_418        equ     27439                   ; FIX(0.41869)
+F_0_587        equ     38470                   ; FIX(0.58700)
+; FIX(0.587) = 38470 does not fit in a signed 16-bit word, which is what
+; pmaddwd multiplies, so the green weight is split into 0.337 + 0.250 and
+; applied through two separate tables (PW_F0299_F0337 and PW_F0114_F0250).
+F_0_337        equ     (F_0_587 - F_0_250)     ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+       SECTION SEG_CONST
+
+       alignz  16
+       global  EXTN(jconst_rgb_ycc_convert_sse2)
+
+; Exported constant table.  Every entry below is one 16-byte vector:
+; "times 4 dw a, b" emits the word pair (a, b) four times, and
+; "times 4 dd x" emits the dword x four times.
+EXTN(jconst_rgb_ycc_convert_sse2):
+
+; Word pairs consumed by pmaddwd: each pair is multiplied with an
+; interleaved pair of colour samples and summed into one dword.
+PW_F0299_F0337 times 4 dw  F_0_299, F_0_337
+PW_F0114_F0250 times 4 dw  F_0_114, F_0_250
+PW_MF016_MF033 times 4 dw -F_0_168,-F_0_331
+PW_MF008_MF041 times 4 dw -F_0_081,-F_0_418
+; Bias added before the ">> SCALEBITS" descale of the chroma sums:
+; (one half - 1) plus CENTERJSAMPLE pre-scaled by 2^SCALEBITS.
+PD_ONEHALFM1_CJ        times 4 dd  (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
+; Rounding bias (one half) added before the ">> SCALEBITS" descale of luma.
+PD_ONEHALF     times 4 dd  (1 << (SCALEBITS-1))
+
+       alignz  16
+
+; --------------------------------------------------------------------------
+; The converter body lives in jcclrss2-64.asm and is assembled once per
+; pixel layout.  Before each %include the RGB_* macros are redefined to
+; give the byte offset of each colour component inside a pixel and the
+; pixel size in bytes, and jsimd_rgb_ycc_convert_sse2 is %define'd to the
+; name of the variant being generated.
+; NOTE(review): presumably the included file declares its entry point
+; through the jsimd_rgb_ycc_convert_sse2 name -- confirm in jcclrss2-64.asm.
+
+; Default layout: uses whatever RGB_* values are already in effect at this
+; point (presumably supplied via jsimdext.inc -- TODO confirm).
+%include "jcclrss2-64.asm"
+
+; extrgb: R,G,B at offsets 0,1,2; 3 bytes per pixel.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgb_ycc_convert_sse2
+%include "jcclrss2-64.asm"
+
+; extrgbx: R,G,B at offsets 0,1,2; 4 bytes per pixel (offset 3 unused).
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgbx_ycc_convert_sse2
+%include "jcclrss2-64.asm"
+
+; extbgr: B,G,R at offsets 0,1,2; 3 bytes per pixel.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgr_ycc_convert_sse2
+%include "jcclrss2-64.asm"
+
+; extbgrx: B,G,R at offsets 0,1,2; 4 bytes per pixel (offset 3 unused).
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgrx_ycc_convert_sse2
+%include "jcclrss2-64.asm"
+
+; extxbgr: B,G,R at offsets 1,2,3; 4 bytes per pixel (offset 0 unused).
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extxbgr_ycc_convert_sse2
+%include "jcclrss2-64.asm"
+
+; extxrgb: R,G,B at offsets 1,2,3; 4 bytes per pixel (offset 0 unused).
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4
+%define jsimd_rgb_ycc_convert_sse2 jsimd_extxrgb_ycc_convert_sse2
+%include "jcclrss2-64.asm"
diff --git a/common/jpeg/simd/jcqnts2f-64.asm b/common/jpeg/simd/jcqnts2f-64.asm
new file mode 100644 (file)
index 0000000..51d6307
--- /dev/null
@@ -0,0 +1,152 @@
+;
+; jcqnts2f.asm - sample data conversion and quantization (64-bit SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+       SECTION SEG_TEXT
+       BITS    64
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                            FAST_FLOAT * workspace);
+;
+
+; Register roles after collect_args (macro from jsimdext.inc; presumably it
+; copies the incoming argument registers into r10.. -- confirm there):
+;
+; r10 = JSAMPARRAY sample_data
+; r11 = JDIMENSION start_col   (32-bit value; upper half of r11 is undefined)
+; r12 = FAST_FLOAT * workspace
+;
+; Each loop pass converts two 8-sample rows: the samples are re-centred by
+; subtracting 0x80, sign-extended to 32 bits and converted to float.
+; The movaps stores require the workspace to be 16-byte aligned.
+
+       align   16
+       global  EXTN(jsimd_convsamp_float_sse2)
+
+EXTN(jsimd_convsamp_float_sse2):
+       push    rbp
+       mov     rbp,rsp
+       push    rbx
+       collect_args
+
+       ; 0xFFFF -> 0xFF80 (-128 as a word) -> saturating pack gives 0x80 bytes
+       pcmpeqw  xmm7,xmm7
+       psllw    xmm7,7
+       packsswb xmm7,xmm7              ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
+
+       mov     rsi, r10                ; rsi = sample_data (array of row pointers)
+       ; JDIMENSION is a 32-bit unsigned int and the x86-64 SysV ABI leaves
+       ; the upper 32 bits of the argument register undefined.  Writing eax
+       ; zero-extends into rax, so rax is safe to use as an address index.
+       mov     eax, r11d               ; rax = start_col (zero-extended)
+       mov     rdi, r12                ; rdi = workspace
+       mov     rcx, DCTSIZE/2          ; two rows are handled per iteration
+.convloop:
+       mov     rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
+       mov     rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
+
+       movq    xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
+       movq    xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]
+
+       psubb   xmm0,xmm7                       ; xmm0=(01234567)
+       psubb   xmm1,xmm7                       ; xmm1=(89ABCDEF)
+
+       punpcklbw xmm0,xmm0                     ; xmm0=(*0*1*2*3*4*5*6*7)
+       punpcklbw xmm1,xmm1                     ; xmm1=(*8*9*A*B*C*D*E*F)
+
+       ; The previous contents of xmm2/xmm3 land in the low word of each
+       ; dword and are shifted out by the psrad below, so they need not be
+       ; initialised.
+       punpcklwd xmm2,xmm0                     ; xmm2=(***0***1***2***3)
+       punpckhwd xmm0,xmm0                     ; xmm0=(***4***5***6***7)
+       punpcklwd xmm3,xmm1                     ; xmm3=(***8***9***A***B)
+       punpckhwd xmm1,xmm1                     ; xmm1=(***C***D***E***F)
+
+       ; The sample sits in the top byte of each dword: an arithmetic shift
+       ; right sign-extends it to a full 32-bit integer.
+       psrad     xmm2,(DWORD_BIT-BYTE_BIT)     ; xmm2=(0123)
+       psrad     xmm0,(DWORD_BIT-BYTE_BIT)     ; xmm0=(4567)
+       cvtdq2ps  xmm2,xmm2                     ; xmm2=(0123)
+       cvtdq2ps  xmm0,xmm0                     ; xmm0=(4567)
+       psrad     xmm3,(DWORD_BIT-BYTE_BIT)     ; xmm3=(89AB)
+       psrad     xmm1,(DWORD_BIT-BYTE_BIT)     ; xmm1=(CDEF)
+       cvtdq2ps  xmm3,xmm3                     ; xmm3=(89AB)
+       cvtdq2ps  xmm1,xmm1                     ; xmm1=(CDEF)
+
+       movaps  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
+       movaps  XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
+       movaps  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
+       movaps  XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
+
+       add     rsi, byte 2*SIZEOF_JSAMPROW             ; next pair of rows
+       add     rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT   ; next two workspace rows
+       dec     rcx
+       jnz     short .convloop
+
+       uncollect_args
+       pop     rbx
+       pop     rbp
+       ret
+
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+;                         FAST_FLOAT * workspace);
+;
+
+; Register roles after collect_args:
+;
+; r10 = JCOEFPTR coef_block
+; r11 = FAST_FLOAT * divisors
+; r12 = FAST_FLOAT * workspace
+;
+; Every pass of the loop multiplies 16 workspace floats by the matching 16
+; divisor-table floats, converts the products to 32-bit integers
+; (cvtps2dq) and packs them with signed saturation into 16 JCOEFs.
+; movaps/movdqa are used throughout, so all three buffers must be
+; 16-byte aligned.
+
+       align   16
+       global  EXTN(jsimd_quantize_float_sse2)
+
+EXTN(jsimd_quantize_float_sse2):
+       push    rbp
+       mov     rbp, rsp
+       collect_args
+
+       mov     rdi, r10                ; rdi = coef_block (output)
+       mov     rdx, r11                ; rdx = divisors
+       mov     rsi, r12                ; rsi = workspace (input)
+       mov     eax, DCTSIZE2/16        ; rax = pass count (32-bit write zero-extends)
+.quant16:
+       ; fetch 16 workspace values
+       movaps  xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
+       movaps  xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)]
+       movaps  xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
+       movaps  xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)]
+
+       ; scale each value by its divisor-table entry
+       mulps   xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
+       mulps   xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
+       mulps   xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+       mulps   xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
+
+       ; float -> int32, then int32 -> int16 with signed saturation
+       cvtps2dq xmm0, xmm0
+       cvtps2dq xmm1, xmm1
+       packssdw xmm0, xmm1
+       cvtps2dq xmm2, xmm2
+       cvtps2dq xmm3, xmm3
+       packssdw xmm2, xmm3
+
+       movdqa  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0
+       movdqa  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2
+
+       ; advance all three cursors by 16 elements
+       add     rdi, byte 16*SIZEOF_JCOEF
+       add     rdx, byte 16*SIZEOF_FAST_FLOAT
+       add     rsi, byte 16*SIZEOF_FAST_FLOAT
+       dec     rax
+       jnz     short .quant16
+
+       uncollect_args
+       pop     rbp
+       ret
diff --git a/common/jpeg/simd/jcqnts2i-64.asm b/common/jpeg/simd/jcqnts2i-64.asm
new file mode 100644 (file)
index 0000000..d561590
--- /dev/null
@@ -0,0 +1,181 @@
+;
+; jcqnts2i.asm - sample data conversion and quantization (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+       SECTION SEG_TEXT
+       BITS    64
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                      DCTELEM * workspace);
+;
+
+; Register roles after collect_args (macro from jsimdext.inc; presumably it
+; copies the incoming argument registers into r10.. -- confirm there):
+;
+; r10 = JSAMPARRAY sample_data
+; r11 = JDIMENSION start_col   (32-bit value; upper half of r11 is undefined)
+; r12 = DCTELEM * workspace
+;
+; Each loop pass converts four 8-sample rows: the bytes are zero-extended
+; to words and 128 is subtracted to re-centre them around zero.
+; The movdqa stores require the workspace to be 16-byte aligned.
+
+       align   16
+       global  EXTN(jsimd_convsamp_sse2)
+
+EXTN(jsimd_convsamp_sse2):
+       push    rbp
+       mov     rbp,rsp
+       push    rbx
+       collect_args
+
+       pxor    xmm6,xmm6               ; xmm6=(all 0's)
+       pcmpeqw xmm7,xmm7
+       psllw   xmm7,7                  ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+       mov     rsi, r10                ; rsi = sample_data (array of row pointers)
+       ; JDIMENSION is a 32-bit unsigned int and the x86-64 SysV ABI leaves
+       ; the upper 32 bits of the argument register undefined.  Writing eax
+       ; zero-extends into rax, so rax is safe to use as an address index.
+       mov     eax, r11d               ; rax = start_col (zero-extended)
+       mov     rdi, r12                ; rdi = workspace
+       mov     rcx, DCTSIZE/4          ; four rows are handled per iteration
+.convloop:
+       mov     rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
+       mov     rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
+
+       movq    xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]       ; xmm0=(01234567)
+       movq    xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]       ; xmm1=(89ABCDEF)
+
+       mov     rbx, JSAMPROW [rsi+2*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
+       mov     rdx, JSAMPROW [rsi+3*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
+
+       movq    xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]       ; xmm2=(GHIJKLMN)
+       movq    xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]       ; xmm3=(OPQRSTUV)
+
+       ; zero-extend bytes to words, then add -128 (0xFF80) to each word
+       punpcklbw xmm0,xmm6             ; xmm0=(01234567)
+       punpcklbw xmm1,xmm6             ; xmm1=(89ABCDEF)
+       paddw     xmm0,xmm7
+       paddw     xmm1,xmm7
+       punpcklbw xmm2,xmm6             ; xmm2=(GHIJKLMN)
+       punpcklbw xmm3,xmm6             ; xmm3=(OPQRSTUV)
+       paddw     xmm2,xmm7
+       paddw     xmm3,xmm7
+
+       movdqa  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
+       movdqa  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
+       movdqa  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
+       movdqa  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
+
+       add     rsi, byte 4*SIZEOF_JSAMPROW             ; next four rows
+       add     rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM      ; next four workspace rows
+       dec     rcx
+       jnz     short .convloop
+
+       uncollect_args
+       pop     rbx
+       pop     rbp
+       ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM * divisors,
+;                      DCTELEM * workspace);
+;
+
+; The divisors argument is addressed as three consecutive sub-tables, each
+; DCTSIZE rows apart: reciprocals, corrections, then scale factors.
+%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
+%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
+%define SCALE(m,n,b)      XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
+
+; Register roles after collect_args:
+;
+; r10 = JCOEFPTR coef_block
+; r11 = DCTELEM * divisors
+; r12 = DCTELEM * workspace
+;
+; Each loop pass quantizes 32 coefficients held in four XMM registers:
+;   1. take the absolute value, remembering the sign masks in xmm4-xmm7
+;   2. add the correction term
+;   3. multiply by the reciprocal, then by the scale, keeping the high
+;      16 bits of each unsigned product (pmulhuw)
+;   4. re-apply the original signs and store.
+; All loads and stores use movdqa, so the three buffers must be 16-byte
+; aligned.
+
+       align   16
+       global  EXTN(jsimd_quantize_sse2)
+
+EXTN(jsimd_quantize_sse2):
+       push    rbp
+       mov     rbp,rsp
+       collect_args
+
+       ; rsi = workspace (input), rdx = divisors, rdi = coef_block (output)
+       mov rsi, r12
+       mov rdx, r11
+       mov rdi, r10
+       mov     rax, DCTSIZE2/32
+.quantloop:
+       movdqa  xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)]
+       movdqa  xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
+       movdqa  xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
+       movdqa  xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]
+       movdqa  xmm0,xmm4
+       movdqa  xmm1,xmm5
+       movdqa  xmm2,xmm6
+       movdqa  xmm3,xmm7
+       ; Arithmetic shift by (WORD_BIT-1) replicates the sign bit across each
+       ; word: xmm4-xmm7 become per-word sign masks (all ones if negative).
+       psraw   xmm4,(WORD_BIT-1)
+       psraw   xmm5,(WORD_BIT-1)
+       psraw   xmm6,(WORD_BIT-1)
+       psraw   xmm7,(WORD_BIT-1)
+       ; (x XOR mask) - mask negates exactly the words whose mask is all ones.
+       pxor    xmm0,xmm4
+       pxor    xmm1,xmm5
+       pxor    xmm2,xmm6
+       pxor    xmm3,xmm7
+       psubw   xmm0,xmm4               ; if (xmm0 < 0) xmm0 = -xmm0;
+       psubw   xmm1,xmm5               ; if (xmm1 < 0) xmm1 = -xmm1;
+       psubw   xmm2,xmm6               ; if (xmm2 < 0) xmm2 = -xmm2;
+       psubw   xmm3,xmm7               ; if (xmm3 < 0) xmm3 = -xmm3;
+
+       paddw   xmm0, XMMWORD [CORRECTION(0,0,rdx)]  ; correction + roundfactor
+       paddw   xmm1, XMMWORD [CORRECTION(1,0,rdx)]
+       paddw   xmm2, XMMWORD [CORRECTION(2,0,rdx)]
+       paddw   xmm3, XMMWORD [CORRECTION(3,0,rdx)]
+       pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,rdx)]  ; reciprocal
+       pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,rdx)]
+       pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,rdx)]
+       pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,rdx)]
+       pmulhuw xmm0, XMMWORD [SCALE(0,0,rdx)]  ; scale
+       pmulhuw xmm1, XMMWORD [SCALE(1,0,rdx)]
+       pmulhuw xmm2, XMMWORD [SCALE(2,0,rdx)]
+       pmulhuw xmm3, XMMWORD [SCALE(3,0,rdx)]
+
+       ; Restore the original signs using the masks saved in xmm4-xmm7.
+       pxor    xmm0,xmm4
+       pxor    xmm1,xmm5
+       pxor    xmm2,xmm6
+       pxor    xmm3,xmm7
+       psubw   xmm0,xmm4
+       psubw   xmm1,xmm5
+       psubw   xmm2,xmm6
+       psubw   xmm3,xmm7
+       movdqa  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
+       movdqa  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
+       movdqa  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
+       movdqa  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
+
+       ; Advance input, divisor and output cursors by 32 elements each.
+       add     rsi, byte 32*SIZEOF_DCTELEM
+       add     rdx, byte 32*SIZEOF_DCTELEM
+       add     rdi, byte 32*SIZEOF_JCOEF
+       dec     rax
+       jnz     near .quantloop
+
+       uncollect_args
+       pop     rbp
+       ret
diff --git a/common/jpeg/simd/jcsamss2-64.asm b/common/jpeg/simd/jcsamss2-64.asm
new file mode 100644 (file)
index 0000000..cf7a776
--- /dev/null
@@ -0,0 +1,324 @@
+;
+; jcsamss2.asm - downsampling (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+       SECTION SEG_TEXT
+       BITS    64
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
+;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
+;
+
+; Register roles after collect_args (macro from jsimdext.inc; presumably it
+; copies the incoming argument registers into r10.. -- confirm there):
+;
+; r10 = JDIMENSION image_width
+; r11 = int max_v_samp_factor
+; r12 = JDIMENSION v_samp_factor
+; r13 = JDIMENSION width_blocks
+; r14 = JSAMPARRAY input_data
+; r15 = JSAMPARRAY output_data
+;
+; The first four arguments are 32-bit C types.  The x86-64 SysV ABI leaves
+; the upper 32 bits of their registers undefined, so each one is read
+; through its 32-bit sub-register (r10d..r13d); the 32-bit destination
+; write zero-extends into the full 64-bit register.
+;
+; Phase 1 (expand_right_edge) replicates the last real sample of every
+; input row up to 2*output_cols samples.  Phase 2 averages each horizontal
+; pair of samples, alternating a rounding bias of 0 and 1.
+; Row data is accessed with movdqa and therefore must be 16-byte aligned.
+
+       align   16
+       global  EXTN(jsimd_h2v1_downsample_sse2)
+
+EXTN(jsimd_h2v1_downsample_sse2):
+       push    rbp
+       mov     rbp,rsp
+       collect_args
+
+       mov     ecx, r13d               ; rcx = width_blocks (zero-extended)
+       shl     rcx,3                   ; imul rcx,DCTSIZE (rcx = output_cols)
+       jz      near .return
+
+       mov     edx, r10d               ; rdx = image_width (zero-extended)
+
+       ; -- expand_right_edge
+
+       push    rcx
+       shl     rcx,1                           ; output_cols * 2
+       sub     rcx,rdx                         ; rcx = samples of padding needed
+       jle     short .expand_end
+
+       mov     eax, r11d                       ; rax = max_v_samp_factor
+       test    eax,eax                         ; signed 32-bit test: it is a C int
+       jle     short .expand_end
+
+       cld
+       mov     rsi, r14        ; input_data
+.expandloop:
+       push    rax
+       push    rcx
+
+       mov     rdi, JSAMPROW [rsi]
+       add     rdi,rdx                         ; rdi = one past the last real sample
+       mov     al, JSAMPLE [rdi-1]             ; al = last real sample
+
+       rep stosb                               ; replicate it rcx times
+
+       pop     rcx
+       pop     rax
+
+       add     rsi, byte SIZEOF_JSAMPROW
+       dec     rax
+       jg      short .expandloop
+
+.expand_end:
+       pop     rcx                             ; output_cols
+
+       ; -- h2v1_downsample
+
+       mov     eax, r12d       ; rowctr (zero-extended v_samp_factor)
+       test    eax,eax
+       jle     near .return
+
+       mov     rdx, 0x00010000         ; bias pattern
+       movd    xmm7,edx
+       pcmpeqw xmm6,xmm6
+       pshufd  xmm7,xmm7,0x00          ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+       psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+       mov     rsi, r14        ; input_data
+       mov     rdi, r15        ; output_data
+.rowloop:
+       push    rcx
+       push    rdi
+       push    rsi
+
+       mov     rsi, JSAMPROW [rsi]             ; inptr
+       mov     rdi, JSAMPROW [rdi]             ; outptr
+
+       cmp     rcx, byte SIZEOF_XMMWORD
+       jae     short .columnloop
+
+.columnloop_r8:
+       ; Fewer than SIZEOF_XMMWORD output samples remain: load one input
+       ; vector, zero the other, and force rcx so that this is the last pass.
+       ; NOTE(review): a full 16-byte vector is still stored below, so the
+       ; output row is assumed to have room for it -- confirm with caller.
+       movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+       pxor    xmm1,xmm1
+       mov     rcx, SIZEOF_XMMWORD
+       jmp     short .downsample
+
+.columnloop:
+       movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+       movdqa  xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+.downsample:
+       movdqa  xmm2,xmm0
+       movdqa  xmm3,xmm1
+
+       pand    xmm0,xmm6               ; even-numbered samples as words
+       psrlw   xmm2,BYTE_BIT           ; odd-numbered samples as words
+       pand    xmm1,xmm6
+       psrlw   xmm3,BYTE_BIT
+
+       paddw   xmm0,xmm2               ; even + odd
+       paddw   xmm1,xmm3
+       paddw   xmm0,xmm7               ; + alternating bias (0, 1, 0, 1, ..)
+       paddw   xmm1,xmm7
+       psrlw   xmm0,1                  ; / 2
+       psrlw   xmm1,1
+
+       packuswb xmm0,xmm1              ; 16 output samples
+
+       movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+
+       sub     rcx, byte SIZEOF_XMMWORD        ; outcol
+       add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
+       add     rdi, byte 1*SIZEOF_XMMWORD      ; outptr
+       cmp     rcx, byte SIZEOF_XMMWORD
+       jae     short .columnloop
+       test    rcx,rcx
+       jnz     short .columnloop_r8
+
+       pop     rsi
+       pop     rdi
+       pop     rcx
+
+       add     rsi, byte SIZEOF_JSAMPROW       ; input_data
+       add     rdi, byte SIZEOF_JSAMPROW       ; output_data
+       dec     rax                             ; rowctr
+       jg      near .rowloop
+
+.return:
+       uncollect_args
+       pop     rbp
+       ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
+;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
+;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
+; The 32-bit arguments are read through r10d..r13d only: the x86-64 ABI leaves
+; the upper 32 bits of a register that carries a 32-bit argument undefined.
+; r10d = JDIMENSION image_width
+; r11d = int max_v_samp_factor
+; r12d = JDIMENSION v_samp_factor
+; r13d = JDIMENSION width_blocks
+; r14  = JSAMPARRAY input_data
+; r15  = JSAMPARRAY output_data
+
+       align   16
+       global  EXTN(jsimd_h2v2_downsample_sse2)
+
+EXTN(jsimd_h2v2_downsample_sse2):
+       push    rbp
+       mov     rbp,rsp
+       collect_args
+
+       mov     ecx, r13d               ; rcx = width_blocks, zero-extended
+       shl     rcx,3                   ; imul rcx,DCTSIZE (rcx = output_cols)
+       jz      near .return
+
+       mov     edx, r10d               ; rdx = image_width, zero-extended
+
+       ; -- expand_right_edge
+
+       push    rcx
+       shl     rcx,1                           ; output_cols * 2
+       sub     rcx,rdx                         ; rcx = padding samples needed per row
+       jle     short .expand_end
+
+       movsxd  rax, r11d                       ; rax = max_v_samp_factor (signed int)
+       test    rax,rax
+       jle     short .expand_end
+
+       cld
+       mov     rsi, r14        ; input_data
+.expandloop:
+       push    rax
+       push    rcx
+
+       mov     rdi, JSAMPROW [rsi]
+       add     rdi,rdx                         ; rdi -> first sample past image_width
+       mov     al, JSAMPLE [rdi-1]             ; al = last real sample of the row
+
+       rep stosb                               ; replicate it rcx times
+
+       pop     rcx
+       pop     rax
+
+       add     rsi, byte SIZEOF_JSAMPROW
+       dec     rax
+       jg      short .expandloop
+
+.expand_end:
+       pop     rcx                             ; output_cols
+
+       ; -- h2v2_downsample
+
+       mov     eax, r12d       ; rowctr = v_samp_factor, zero-extended
+       test    rax,rax
+       jle     near .return
+
+       mov     edx, 0x00020001         ; bias pattern
+       movd    xmm7,edx
+       pcmpeqw xmm6,xmm6
+       pshufd  xmm7,xmm7,0x00          ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
+       psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+       mov     rsi, r14        ; input_data
+       mov     rdi, r15        ; output_data
+.rowloop:
+       push    rcx
+       push    rdi
+       push    rsi
+
+       mov     rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; inptr0
+       mov     rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; inptr1
+       mov     rdi, JSAMPROW [rdi]                     ; outptr
+
+       cmp     rcx, byte SIZEOF_XMMWORD
+       jae     short .columnloop
+
+.columnloop_r8:                         ; fewer than 16 output columns remain
+       movdqa  xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
+       movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+       pxor    xmm2,xmm2               ; upper half of the result is zero-filled
+       pxor    xmm3,xmm3
+       mov     rcx, SIZEOF_XMMWORD     ; makes the loop exit after this pass
+       jmp     short .downsample
+
+.columnloop:
+       movdqa  xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
+       movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+       movdqa  xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
+       movdqa  xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+.downsample:
+       movdqa  xmm4,xmm0
+       movdqa  xmm5,xmm1
+       pand    xmm0,xmm6               ; even-indexed samples as words
+       psrlw   xmm4,BYTE_BIT           ; odd-indexed samples as words
+       pand    xmm1,xmm6
+       psrlw   xmm5,BYTE_BIT
+       paddw   xmm0,xmm4               ; horizontal pair sums, row 0
+       paddw   xmm1,xmm5               ; horizontal pair sums, row 1
+
+       movdqa  xmm4,xmm2
+       movdqa  xmm5,xmm3
+       pand    xmm2,xmm6
+       psrlw   xmm4,BYTE_BIT
+       pand    xmm3,xmm6
+       psrlw   xmm5,BYTE_BIT
+       paddw   xmm2,xmm4
+       paddw   xmm3,xmm5
+
+       paddw   xmm0,xmm1               ; 2x2 block sums
+       paddw   xmm2,xmm3
+       paddw   xmm0,xmm7               ; add the alternating 1,2 bias
+       paddw   xmm2,xmm7
+       psrlw   xmm0,2                  ; divide by 4
+       psrlw   xmm2,2
+
+       packuswb xmm0,xmm2              ; 16 output samples
+
+       movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+
+       sub     rcx, byte SIZEOF_XMMWORD        ; outcol
+       add     rdx, byte 2*SIZEOF_XMMWORD      ; inptr0
+       add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr1
+       add     rdi, byte 1*SIZEOF_XMMWORD      ; outptr
+       cmp     rcx, byte SIZEOF_XMMWORD
+       jae     near .columnloop
+       test    rcx,rcx
+       jnz     near .columnloop_r8
+
+       pop     rsi
+       pop     rdi
+       pop     rcx
+
+       add     rsi, byte 2*SIZEOF_JSAMPROW     ; input_data
+       add     rdi, byte 1*SIZEOF_JSAMPROW     ; output_data
+       dec     rax                             ; rowctr
+       jg      near .rowloop
+
+.return:
+       uncollect_args
+       pop     rbp
+       ret
diff --git a/common/jpeg/simd/jdclrss2-64.asm b/common/jpeg/simd/jdclrss2-64.asm
new file mode 100644 (file)
index 0000000..0808eca
--- /dev/null
@@ -0,0 +1,483 @@
+;
+; jdclrss2-64.asm - colorspace conversion (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+                               
+; --------------------------------------------------------------------------
+       SECTION SEG_TEXT
+       BITS    64
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_sse2 (JDIMENSION out_width,
+;                             JSAMPIMAGE input_buf, JDIMENSION input_row,
+;                             JSAMPARRAY output_buf, int num_rows)
+; The 32-bit arguments are read through r10d/r12d/r14d only: the x86-64 ABI
+; leaves the upper 32 bits of a register carrying a 32-bit argument undefined.
+; r10d = JDIMENSION out_width
+; r11  = JSAMPIMAGE input_buf
+; r12d = JDIMENSION input_row
+; r13  = JSAMPARRAY output_buf
+; r14d = int num_rows
+
+%define wk(i)          rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM         2
+
+       align   16
+       global  EXTN(jsimd_ycc_rgb_convert_sse2)
+
+EXTN(jsimd_ycc_rgb_convert_sse2):
+       push    rbp
+       mov     rax,rsp                         ; rax = original rsp (after push rbp)
+       sub     rsp, byte 8                     ; room for the 8-byte saved rsp
+       and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
+       mov     [rsp],rax                       ; saved for the epilogue's "pop rsp"
+       mov     rbp,rsp                         ; rbp = aligned rbp
+       lea     rsp, [wk(0)]
+       push    rbx
+       collect_args
+
+       mov     ecx, r10d       ; num_cols (zero-extended)
+       test    rcx,rcx
+       jz      near .return
+
+       push    rcx
+
+       mov     rdi, r11
+       mov     ecx, r12d       ; input_row (zero-extended)
+       mov     rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+       mov     rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+       mov     rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+       lea     rsi, [rsi+rcx*SIZEOF_JSAMPROW]
+       lea     rbx, [rbx+rcx*SIZEOF_JSAMPROW]
+       lea     rdx, [rdx+rcx*SIZEOF_JSAMPROW]
+
+       pop     rcx
+
+       mov     rdi, r13
+       movsxd  rax, r14d       ; num_rows (signed int, sign-extended)
+       test    rax,rax
+       jle     near .return
+.rowloop:
+       push    rax
+       push    rdi
+       push    rdx
+       push    rbx
+       push    rsi
+       push    rcx                     ; col
+
+       mov     rsi, JSAMPROW [rsi]     ; inptr0
+       mov     rbx, JSAMPROW [rbx]     ; inptr1
+       mov     rdx, JSAMPROW [rdx]     ; inptr2
+       mov     rdi, JSAMPROW [rdi]     ; outptr
+.columnloop:
+
+       movdqa  xmm5, XMMWORD [rbx]     ; xmm5=Cb(0123456789ABCDEF)
+       movdqa  xmm1, XMMWORD [rdx]     ; xmm1=Cr(0123456789ABCDEF)
+
+       pcmpeqw xmm4,xmm4
+       pcmpeqw xmm7,xmm7
+       psrlw   xmm4,BYTE_BIT
+       psllw   xmm7,7                  ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+       movdqa  xmm0,xmm4               ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
+
+       pand    xmm4,xmm5               ; xmm4=Cb(02468ACE)=CbE
+       psrlw   xmm5,BYTE_BIT           ; xmm5=Cb(13579BDF)=CbO
+       pand    xmm0,xmm1               ; xmm0=Cr(02468ACE)=CrE
+       psrlw   xmm1,BYTE_BIT           ; xmm1=Cr(13579BDF)=CrO
+
+       paddw   xmm4,xmm7
+       paddw   xmm5,xmm7
+       paddw   xmm0,xmm7
+       paddw   xmm1,xmm7
+
+       ; (Original)
+       ; R = Y                + 1.40200 * Cr
+       ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+       ; B = Y + 1.77200 * Cb
+       ;
+       ; (This implementation)
+       ; R = Y                + 0.40200 * Cr + Cr
+       ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+       ; B = Y - 0.22800 * Cb + Cb + Cb
+
+       movdqa  xmm2,xmm4               ; xmm2=CbE
+       movdqa  xmm3,xmm5               ; xmm3=CbO
+       paddw   xmm4,xmm4               ; xmm4=2*CbE
+       paddw   xmm5,xmm5               ; xmm5=2*CbO
+       movdqa  xmm6,xmm0               ; xmm6=CrE
+       movdqa  xmm7,xmm1               ; xmm7=CrO
+       paddw   xmm0,xmm0               ; xmm0=2*CrE
+       paddw   xmm1,xmm1               ; xmm1=2*CrO
+
+       pmulhw  xmm4,[rel PW_MF0228]    ; xmm4=(2*CbE * -FIX(0.22800))
+       pmulhw  xmm5,[rel PW_MF0228]    ; xmm5=(2*CbO * -FIX(0.22800))
+       pmulhw  xmm0,[rel PW_F0402]     ; xmm0=(2*CrE * FIX(0.40200))
+       pmulhw  xmm1,[rel PW_F0402]     ; xmm1=(2*CrO * FIX(0.40200))
+
+       paddw   xmm4,[rel PW_ONE]
+       paddw   xmm5,[rel PW_ONE]
+       psraw   xmm4,1                  ; xmm4=(CbE * -FIX(0.22800))
+       psraw   xmm5,1                  ; xmm5=(CbO * -FIX(0.22800))
+       paddw   xmm0,[rel PW_ONE]
+       paddw   xmm1,[rel PW_ONE]
+       psraw   xmm0,1                  ; xmm0=(CrE * FIX(0.40200))
+       psraw   xmm1,1                  ; xmm1=(CrO * FIX(0.40200))
+
+       paddw   xmm4,xmm2
+       paddw   xmm5,xmm3
+       paddw   xmm4,xmm2               ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
+       paddw   xmm5,xmm3               ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
+       paddw   xmm0,xmm6               ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
+       paddw   xmm1,xmm7               ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
+
+       movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=(B-Y)E
+       movdqa  XMMWORD [wk(1)], xmm5   ; wk(1)=(B-Y)O
+
+       movdqa    xmm4,xmm2
+       movdqa    xmm5,xmm3
+       punpcklwd xmm2,xmm6
+       punpckhwd xmm4,xmm6
+       pmaddwd   xmm2,[rel PW_MF0344_F0285]
+       pmaddwd   xmm4,[rel PW_MF0344_F0285]
+       punpcklwd xmm3,xmm7
+       punpckhwd xmm5,xmm7
+       pmaddwd   xmm3,[rel PW_MF0344_F0285]
+       pmaddwd   xmm5,[rel PW_MF0344_F0285]
+
+       paddd     xmm2,[rel PD_ONEHALF]
+       paddd     xmm4,[rel PD_ONEHALF]
+       psrad     xmm2,SCALEBITS
+       psrad     xmm4,SCALEBITS
+       paddd     xmm3,[rel PD_ONEHALF]
+       paddd     xmm5,[rel PD_ONEHALF]
+       psrad     xmm3,SCALEBITS
+       psrad     xmm5,SCALEBITS
+
+       packssdw  xmm2,xmm4     ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+       packssdw  xmm3,xmm5     ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+       psubw     xmm2,xmm6     ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+       psubw     xmm3,xmm7     ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+       movdqa    xmm5, XMMWORD [rsi]   ; xmm5=Y(0123456789ABCDEF)
+
+       pcmpeqw   xmm4,xmm4
+       psrlw     xmm4,BYTE_BIT         ; xmm4={0xFF 0x00 0xFF 0x00 ..}
+       pand      xmm4,xmm5             ; xmm4=Y(02468ACE)=YE
+       psrlw     xmm5,BYTE_BIT         ; xmm5=Y(13579BDF)=YO
+
+       paddw     xmm0,xmm4             ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
+       paddw     xmm1,xmm5             ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
+       packuswb  xmm0,xmm0             ; xmm0=R(02468ACE********)
+       packuswb  xmm1,xmm1             ; xmm1=R(13579BDF********)
+
+       paddw     xmm2,xmm4             ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
+       paddw     xmm3,xmm5             ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
+       packuswb  xmm2,xmm2             ; xmm2=G(02468ACE********)
+       packuswb  xmm3,xmm3             ; xmm3=G(13579BDF********)
+
+       paddw     xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
+       paddw     xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
+       packuswb  xmm4,xmm4             ; xmm4=B(02468ACE********)
+       packuswb  xmm5,xmm5             ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+       ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+       ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+       ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+       ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+       punpcklbw xmmA,xmmC     ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+       punpcklbw xmmE,xmmB     ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+       punpcklbw xmmD,xmmF     ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+       movdqa    xmmG,xmmA
+       movdqa    xmmH,xmmA
+       punpcklwd xmmA,xmmE     ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+       punpckhwd xmmG,xmmE     ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+       psrldq    xmmH,2        ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+       psrldq    xmmE,2        ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+       movdqa    xmmC,xmmD
+       movdqa    xmmB,xmmD
+       punpcklwd xmmD,xmmH     ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+       punpckhwd xmmC,xmmH     ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+       psrldq    xmmB,2        ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+       movdqa    xmmF,xmmE
+       punpcklwd xmmE,xmmB     ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+       punpckhwd xmmF,xmmB     ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+       pshufd    xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+       movdqa    xmmB,xmmE
+       punpckldq xmmA,xmmD     ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+       punpckldq xmmE,xmmH     ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+       punpckhdq xmmD,xmmB     ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+       pshufd    xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+       movdqa    xmmB,xmmF
+       punpckldq xmmG,xmmC     ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+       punpckldq xmmF,xmmH     ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+       punpckhdq xmmC,xmmB     ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+       punpcklqdq xmmA,xmmE    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+       punpcklqdq xmmD,xmmG    ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+       punpcklqdq xmmF,xmmC    ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+       cmp     rcx, byte SIZEOF_XMMWORD
+       jb      short .column_st32
+
+       test    rdi, SIZEOF_XMMWORD-1
+       jnz     short .out1
+       ; --(aligned)-------------------
+       movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+       movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+       movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+       add     rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+       jmp     short .out0
+.out1: ; --(unaligned)-----------------
+       pcmpeqb    xmmH,xmmH                    ; xmmH=(all 1's)
+       maskmovdqu xmmA,xmmH                    ; movntdqu XMMWORD [rdi], xmmA
+       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       maskmovdqu xmmD,xmmH                    ; movntdqu XMMWORD [rdi], xmmD
+       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       maskmovdqu xmmF,xmmH                    ; movntdqu XMMWORD [rdi], xmmF
+       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+.out0:
+       sub     rcx, byte SIZEOF_XMMWORD
+       jz      near .nextrow
+
+       add     rsi, byte SIZEOF_XMMWORD        ; inptr0
+       add     rbx, byte SIZEOF_XMMWORD        ; inptr1
+       add     rdx, byte SIZEOF_XMMWORD        ; inptr2
+       jmp     near .columnloop
+
+.column_st32:
+       pcmpeqb xmmH,xmmH                       ; xmmH=(all 1's)
+       lea     rcx, [rcx+rcx*2]                ; imul ecx, RGB_PIXELSIZE
+       cmp     rcx, byte 2*SIZEOF_XMMWORD
+       jb      short .column_st16
+       maskmovdqu xmmA,xmmH                    ; movntdqu XMMWORD [rdi], xmmA
+       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       maskmovdqu xmmD,xmmH                    ; movntdqu XMMWORD [rdi], xmmD
+       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       movdqa  xmmA,xmmF
+       sub     rcx, byte 2*SIZEOF_XMMWORD
+       jmp     short .column_st15
+.column_st16:
+       cmp     rcx, byte SIZEOF_XMMWORD
+       jb      short .column_st15
+       maskmovdqu xmmA,xmmH                    ; movntdqu XMMWORD [rdi], xmmA
+       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       movdqa  xmmA,xmmD
+       sub     rcx, byte SIZEOF_XMMWORD
+.column_st15:
+       mov     rax,rcx
+       xor     rcx, byte 0x0F
+       shl     rcx, 2
+       movd    xmmB,ecx
+       psrlq   xmmH,4
+       pcmpeqb xmmE,xmmE
+       psrlq   xmmH,xmmB
+       psrlq   xmmE,xmmB
+       punpcklbw xmmE,xmmH
+       ; ----------------
+       mov     rcx,rdi
+       and     rcx, byte SIZEOF_XMMWORD-1
+       jz      short .adj0
+       add     rax,rcx
+       cmp     rax, byte SIZEOF_XMMWORD
+       ja      short .adj0
+       and     rdi, byte (-SIZEOF_XMMWORD)     ; align to 16-byte boundary
+       shl     rcx, 3                  ; pslldq xmmA,ecx & pslldq xmmE,rcx
+       movdqa  xmmG,xmmA
+       movdqa  xmmC,xmmE
+       pslldq  xmmA, SIZEOF_XMMWORD/2
+       pslldq  xmmE, SIZEOF_XMMWORD/2
+       movd    xmmD,ecx
+       sub     rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+       jb      short .adj1
+       movd    xmmF,ecx
+       psllq   xmmA,xmmF
+       psllq   xmmE,xmmF
+       jmp     short .adj0
+.adj1: neg     ecx
+       movd    xmmF,ecx
+       psrlq   xmmA,xmmF
+       psrlq   xmmE,xmmF
+       psllq   xmmG,xmmD
+       psllq   xmmC,xmmD
+       por     xmmA,xmmG
+       por     xmmE,xmmC
+.adj0: ; ----------------
+       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [rdi], xmmA
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+       pcmpeqb   xmm6,xmm6             ; xmm6=XE=X(02468ACE********)
+       pcmpeqb   xmm7,xmm7             ; xmm7=XO=X(13579BDF********)
+%else
+       pxor      xmm6,xmm6             ; xmm6=XE=X(02468ACE********)
+       pxor      xmm7,xmm7             ; xmm7=XO=X(13579BDF********)
+%endif
+       ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+       ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+       ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+       ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+       punpcklbw xmmA,xmmC     ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+       punpcklbw xmmE,xmmG     ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+       punpcklbw xmmB,xmmD     ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+       punpcklbw xmmF,xmmH     ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+       movdqa    xmmC,xmmA
+       punpcklwd xmmA,xmmE     ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+       punpckhwd xmmC,xmmE     ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+       movdqa    xmmG,xmmB
+       punpcklwd xmmB,xmmF     ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+       punpckhwd xmmG,xmmF     ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+       movdqa    xmmD,xmmA
+       punpckldq xmmA,xmmB     ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+       punpckhdq xmmD,xmmB     ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+       movdqa    xmmH,xmmC
+       punpckldq xmmC,xmmG     ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+       punpckhdq xmmH,xmmG     ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+       cmp     rcx, byte SIZEOF_XMMWORD
+       jb      short .column_st32
+
+       test    rdi, SIZEOF_XMMWORD-1
+       jnz     short .out1
+       ; --(aligned)-------------------
+       movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+       movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+       movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+       movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+       add     rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+       jmp     short .out0
+.out1: ; --(unaligned)-----------------
+       pcmpeqb    xmmE,xmmE                    ; xmmE=(all 1's)
+       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [rdi], xmmA
+       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       maskmovdqu xmmD,xmmE                    ; movntdqu XMMWORD [rdi], xmmD
+       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       maskmovdqu xmmC,xmmE                    ; movntdqu XMMWORD [rdi], xmmC
+       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       maskmovdqu xmmH,xmmE                    ; movntdqu XMMWORD [rdi], xmmH
+       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+.out0:
+       sub     rcx, byte SIZEOF_XMMWORD
+       jz      near .nextrow
+
+       add     rsi, byte SIZEOF_XMMWORD        ; inptr0
+       add     rbx, byte SIZEOF_XMMWORD        ; inptr1
+       add     rdx, byte SIZEOF_XMMWORD        ; inptr2
+       jmp     near .columnloop
+
+.column_st32:
+       pcmpeqb xmmE,xmmE                       ; xmmE=(all 1's)
+       cmp     rcx, byte SIZEOF_XMMWORD/2
+       jb      short .column_st16
+       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [rdi], xmmA
+       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       maskmovdqu xmmD,xmmE                    ; movntdqu XMMWORD [rdi], xmmD
+       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       movdqa  xmmA,xmmC
+       movdqa  xmmD,xmmH
+       sub     rcx, byte SIZEOF_XMMWORD/2
+.column_st16:
+       cmp     rcx, byte SIZEOF_XMMWORD/4
+       jb      short .column_st15
+       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [rdi], xmmA
+       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       movdqa  xmmA,xmmD
+       sub     rcx, byte SIZEOF_XMMWORD/4
+.column_st15:
+       cmp     rcx, byte SIZEOF_XMMWORD/16
+       jb      near .nextrow
+       mov     rax,rcx
+       xor     rcx, byte 0x03
+       inc     rcx
+       shl     rcx, 4
+       movd    xmmF,ecx
+       psrlq   xmmE,xmmF
+       punpcklbw xmmE,xmmE
+       ; ----------------
+       mov     rcx,rdi
+       and     rcx, byte SIZEOF_XMMWORD-1
+       jz      short .adj0
+       lea     rax, [rcx+rax*4]        ; RGB_PIXELSIZE
+       cmp     rax, byte SIZEOF_XMMWORD
+       ja      short .adj0
+       and     rdi, byte (-SIZEOF_XMMWORD)     ; align to 16-byte boundary
+       shl     rcx, 3                  ; pslldq xmmA,ecx & pslldq xmmE,ecx
+       movdqa  xmmB,xmmA
+       movdqa  xmmG,xmmE
+       pslldq  xmmA, SIZEOF_XMMWORD/2
+       pslldq  xmmE, SIZEOF_XMMWORD/2
+       movd    xmmC,ecx
+       sub     rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+       jb      short .adj1
+       movd    xmmH,ecx
+       psllq   xmmA,xmmH
+       psllq   xmmE,xmmH
+       jmp     short .adj0
+.adj1: neg     rcx
+       movd    xmmH,ecx
+       psrlq   xmmA,xmmH
+       psrlq   xmmE,xmmH
+       psllq   xmmB,xmmC
+       psllq   xmmG,xmmC
+       por     xmmA,xmmB
+       por     xmmE,xmmG
+.adj0: ; ----------------
+       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [rdi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.nextrow:
+       pop     rcx
+       pop     rsi
+       pop     rbx
+       pop     rdx
+       pop     rdi
+       pop     rax
+
+       add     rsi, byte SIZEOF_JSAMPROW
+       add     rbx, byte SIZEOF_JSAMPROW
+       add     rdx, byte SIZEOF_JSAMPROW
+       add     rdi, byte SIZEOF_JSAMPROW       ; output_buf
+       dec     rax                             ; num_rows
+       jg      near .rowloop
+
+       sfence          ; flush the write buffer
+
+.return:
+       uncollect_args
+       pop     rbx
+       mov     rsp,rbp         ; rsp <- aligned rbp
+       pop     rsp             ; rsp <- original rsp (saved in the prologue)
+       pop     rbp
+       ret
diff --git a/common/jpeg/simd/jdcolss2-64.asm b/common/jpeg/simd/jdcolss2-64.asm
new file mode 100644 (file)
index 0000000..5e8a322
--- /dev/null
@@ -0,0 +1,117 @@
+;
+; jdcolss2-64.asm - colorspace conversion (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS      16
+; FIX(x) below denotes x scaled by 2^SCALEBITS and rounded to an integer.
+F_0_344        equ      22554                  ; FIX(0.34414)
+F_0_714        equ      46802                  ; FIX(0.71414)
+F_1_402        equ      91881                  ; FIX(1.40200)
+F_1_772        equ     116130                  ; FIX(1.77200)
+F_0_402        equ     (F_1_402 - 65536)       ; FIX(1.40200) - FIX(1)
+F_0_285        equ     ( 65536 - F_0_714)      ; FIX(1) - FIX(0.71414)
+F_0_228        equ     (131072 - F_1_772)      ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+       SECTION SEG_CONST
+
+       alignz  16
+       global  EXTN(jconst_ycc_rgb_convert_sse2)
+
+EXTN(jconst_ycc_rgb_convert_sse2):
+; Constant vectors referenced by name from the included jdclrss2-64.asm code.
+PW_F0402       times 8 dw  F_0_402             ; 8 words: FIX(0.40200)
+PW_MF0228      times 8 dw -F_0_228             ; 8 words: -FIX(0.22800)
+PW_MF0344_F0285        times 4 dw -F_0_344, F_0_285    ; 4 word pairs for pmaddwd
+PW_ONE         times 8 dw  1                   ; 8 words: rounding term before psraw 1
+PD_ONEHALF     times 4 dd  1 << (SCALEBITS-1)  ; 4 dwords: rounding term before psrad SCALEBITS
+
+       alignz  16
+; First expansion: uses the RGB_* values already in effect here (presumably defaults from the include files; confirm).
+; --------------------------------------------------------------------------
+%include "jdclrss2-64.asm"
+; Each group below resets RGB_*, renames the entry point, then expands the same code again. extrgb: 0,1,2 / size 3.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgb_convert_sse2
+%include "jdclrss2-64.asm"
+; extrgbx variant: RGB_RED=0, RGB_GREEN=1, RGB_BLUE=2, RGB_PIXELSIZE=4
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgbx_convert_sse2
+%include "jdclrss2-64.asm"
+; extbgr variant: RGB_RED=2, RGB_GREEN=1, RGB_BLUE=0, RGB_PIXELSIZE=3
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgr_convert_sse2
+%include "jdclrss2-64.asm"
+; extbgrx variant: RGB_RED=2, RGB_GREEN=1, RGB_BLUE=0, RGB_PIXELSIZE=4
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgrx_convert_sse2
+%include "jdclrss2-64.asm"
+; extxbgr variant: RGB_RED=3, RGB_GREEN=2, RGB_BLUE=1, RGB_PIXELSIZE=4
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxbgr_convert_sse2
+%include "jdclrss2-64.asm"
+; extxrgb variant: RGB_RED=1, RGB_GREEN=2, RGB_BLUE=3, RGB_PIXELSIZE=4
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4
+%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxrgb_convert_sse2
+%include "jdclrss2-64.asm"
diff --git a/common/jpeg/simd/jdmerss2-64.asm b/common/jpeg/simd/jdmerss2-64.asm
new file mode 100644 (file)
index 0000000..2f9c5c1
--- /dev/null
@@ -0,0 +1,123 @@
+;
+; jdmerss2-64.asm - merged upsampling/color conversion (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS      16
+; FIX(x) below denotes x scaled by 2^SCALEBITS and rounded to an integer.
+F_0_344        equ      22554                  ; FIX(0.34414)
+F_0_714        equ      46802                  ; FIX(0.71414)
+F_1_402        equ      91881                  ; FIX(1.40200)
+F_1_772        equ     116130                  ; FIX(1.77200)
+F_0_402        equ     (F_1_402 - 65536)       ; FIX(1.40200) - FIX(1)
+F_0_285        equ     ( 65536 - F_0_714)      ; FIX(1) - FIX(0.71414)
+F_0_228        equ     (131072 - F_1_772)      ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+       SECTION SEG_CONST
+
+       alignz  16
+       global  EXTN(jconst_merged_upsample_sse2)
+
+EXTN(jconst_merged_upsample_sse2):
+; Constant vectors; presumably referenced by the code in jdmrgss2-64.asm (confirm there).
+PW_F0402       times 8 dw  F_0_402             ; 8 words: FIX(0.40200)
+PW_MF0228      times 8 dw -F_0_228             ; 8 words: -FIX(0.22800)
+PW_MF0344_F0285        times 4 dw -F_0_344, F_0_285    ; 4 interleaved word pairs
+PW_ONE         times 8 dw  1                   ; 8 words: 1
+PD_ONEHALF     times 4 dd  1 << (SCALEBITS-1)  ; 4 dwords: 2^(SCALEBITS-1)
+
+       alignz  16
+; First expansion: uses the RGB_* values already in effect here (presumably defaults from the include files; confirm).
+; --------------------------------------------------------------------------
+%include "jdmrgss2-64.asm"
+; Each group below resets RGB_*, renames both entry points, then expands the same code again. extrgb: 0,1,2 / size 3.
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgb_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgb_merged_upsample_sse2
+%include "jdmrgss2-64.asm"
+; extrgbx variant: RGB_RED=0, RGB_GREEN=1, RGB_BLUE=2, RGB_PIXELSIZE=4
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgbx_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgbx_merged_upsample_sse2
+%include "jdmrgss2-64.asm"
+; extbgr variant: RGB_RED=2, RGB_GREEN=1, RGB_BLUE=0, RGB_PIXELSIZE=3
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgr_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgr_merged_upsample_sse2
+%include "jdmrgss2-64.asm"
+; extbgrx variant: RGB_RED=2, RGB_GREEN=1, RGB_BLUE=0, RGB_PIXELSIZE=4
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgrx_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgrx_merged_upsample_sse2
+%include "jdmrgss2-64.asm"
+; extxbgr variant: RGB_RED=3, RGB_GREEN=2, RGB_BLUE=1, RGB_PIXELSIZE=4
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxbgr_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxbgr_merged_upsample_sse2
+%include "jdmrgss2-64.asm"
+; extxrgb variant: RGB_RED=1, RGB_GREEN=2, RGB_BLUE=3, RGB_PIXELSIZE=4
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxrgb_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxrgb_merged_upsample_sse2
+%include "jdmrgss2-64.asm"
diff --git a/common/jpeg/simd/jdmrgss2-64.asm b/common/jpeg/simd/jdmrgss2-64.asm
new file mode 100644 (file)
index 0000000..6474f43
--- /dev/null
@@ -0,0 +1,565 @@
+;
+; jdmrgss2-64.asm - merged upsampling/color conversion (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+                               
+; --------------------------------------------------------------------------
+       SECTION SEG_TEXT
+       BITS    64
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_sse2 (JDIMENSION output_width,
+;                                  JSAMPIMAGE input_buf,
+;                                  JDIMENSION in_row_group_ctr,
+;                                  JSAMPARRAY output_buf);
+;
+
+; r10 = JDIMENSION output_width
+; r11 = JSAMPIMAGE input_buf
+; r12 = JDIMENSION in_row_group_ctr
+; r13 = JSAMPARRAY output_buf
+
+%define wk(i)          rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM         3
+
+       align   16
+       global  EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+EXTN(jsimd_h2v1_merged_upsample_sse2):
+       push    rbp
+       mov     rax,rsp                         ; rax = rsp after the push (points at saved rbp)
+       sub     rsp, byte 8                     ; room for the qword stored at [rsp] below
+       and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
+       mov     [rsp],rax                       ; keep the unaligned rsp for the epilogue
+       mov     rbp,rsp                         ; rbp = aligned rbp
+       lea     rsp, [wk(0)]                    ; wk[] scratch area sits just below rbp
+       push    rbx
+       collect_args
+       ; output_width / in_row_group_ctr are JDIMENSIONs: take only their low 32 bits
+       mov     ecx, r10d       ; col (zero-extended)
+       test    rcx,rcx
+       jz      near .return
+
+       push    rcx
+
+       mov     rdi, r11
+       mov     ecx, r12d       ; in_row_group_ctr (zero-extended)
+       mov     rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+       mov     rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+       mov     rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+       mov     rdi, r13
+       mov     rsi, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW]         ; inptr0
+       mov     rbx, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW]         ; inptr1
+       mov     rdx, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW]         ; inptr2
+       mov     rdi, JSAMPROW [rdi]                             ; outptr
+
+       pop     rcx                     ; col
+
+.columnloop:
+       ; One pass converts 16 Cb/Cr samples; the Y loop below consumes them in two halves (L, then H)
+       movdqa    xmm6, XMMWORD [rbx]   ; xmm6=Cb(0123456789ABCDEF)
+       movdqa    xmm7, XMMWORD [rdx]   ; xmm7=Cr(0123456789ABCDEF)
+
+       pxor      xmm1,xmm1             ; xmm1=(all 0's)
+       pcmpeqw   xmm3,xmm3
+       psllw     xmm3,7                ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+       movdqa    xmm4,xmm6
+       punpckhbw xmm6,xmm1             ; xmm6=Cb(89ABCDEF)=CbH
+       punpcklbw xmm4,xmm1             ; xmm4=Cb(01234567)=CbL
+       movdqa    xmm0,xmm7
+       punpckhbw xmm7,xmm1             ; xmm7=Cr(89ABCDEF)=CrH
+       punpcklbw xmm0,xmm1             ; xmm0=Cr(01234567)=CrL
+
+       paddw     xmm6,xmm3             ; adding 0xFF80 (= -128) re-centres each chroma word
+       paddw     xmm4,xmm3
+       paddw     xmm7,xmm3
+       paddw     xmm0,xmm3
+
+       ; (Original)
+       ; R = Y                + 1.40200 * Cr
+       ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+       ; B = Y + 1.77200 * Cb
+       ;
+       ; (This implementation)
+       ; R = Y                + 0.40200 * Cr + Cr
+       ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+       ; B = Y - 0.22800 * Cb + Cb + Cb
+
+       movdqa  xmm5,xmm6               ; xmm5=CbH
+       movdqa  xmm2,xmm4               ; xmm2=CbL
+       paddw   xmm6,xmm6               ; xmm6=2*CbH
+       paddw   xmm4,xmm4               ; xmm4=2*CbL
+       movdqa  xmm1,xmm7               ; xmm1=CrH
+       movdqa  xmm3,xmm0               ; xmm3=CrL
+       paddw   xmm7,xmm7               ; xmm7=2*CrH
+       paddw   xmm0,xmm0               ; xmm0=2*CrL
+
+       pmulhw  xmm6,[rel PW_MF0228]    ; xmm6=(2*CbH * -FIX(0.22800))
+       pmulhw  xmm4,[rel PW_MF0228]    ; xmm4=(2*CbL * -FIX(0.22800))
+       pmulhw  xmm7,[rel PW_F0402]     ; xmm7=(2*CrH * FIX(0.40200))
+       pmulhw  xmm0,[rel PW_F0402]     ; xmm0=(2*CrL * FIX(0.40200))
+
+       paddw   xmm6,[rel PW_ONE]
+       paddw   xmm4,[rel PW_ONE]
+       psraw   xmm6,1                  ; xmm6=(CbH * -FIX(0.22800))
+       psraw   xmm4,1                  ; xmm4=(CbL * -FIX(0.22800))
+       paddw   xmm7,[rel PW_ONE]
+       paddw   xmm0,[rel PW_ONE]
+       psraw   xmm7,1                  ; xmm7=(CrH * FIX(0.40200))
+       psraw   xmm0,1                  ; xmm0=(CrL * FIX(0.40200))
+
+       paddw   xmm6,xmm5
+       paddw   xmm4,xmm2
+       paddw   xmm6,xmm5               ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
+       paddw   xmm4,xmm2               ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
+       paddw   xmm7,xmm1               ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
+       paddw   xmm0,xmm3               ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
+
+       movdqa  XMMWORD [wk(0)], xmm6   ; wk(0)=(B-Y)H
+       movdqa  XMMWORD [wk(1)], xmm7   ; wk(1)=(R-Y)H
+
+       movdqa    xmm6,xmm5
+       movdqa    xmm7,xmm2
+       punpcklwd xmm5,xmm1
+       punpckhwd xmm6,xmm1
+       pmaddwd   xmm5,[rel PW_MF0344_F0285]
+       pmaddwd   xmm6,[rel PW_MF0344_F0285]
+       punpcklwd xmm2,xmm3
+       punpckhwd xmm7,xmm3
+       pmaddwd   xmm2,[rel PW_MF0344_F0285]
+       pmaddwd   xmm7,[rel PW_MF0344_F0285]
+
+       paddd     xmm5,[rel PD_ONEHALF]
+       paddd     xmm6,[rel PD_ONEHALF]
+       psrad     xmm5,SCALEBITS
+       psrad     xmm6,SCALEBITS
+       paddd     xmm2,[rel PD_ONEHALF]
+       paddd     xmm7,[rel PD_ONEHALF]
+       psrad     xmm2,SCALEBITS
+       psrad     xmm7,SCALEBITS
+
+       packssdw  xmm5,xmm6     ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+       packssdw  xmm2,xmm7     ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+       psubw     xmm5,xmm1     ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+       psubw     xmm2,xmm3     ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+       movdqa  XMMWORD [wk(2)], xmm5   ; wk(2)=(G-Y)H
+
+       mov     al,2                    ; Yctr
+       jmp     short .Yloop_1st        ; the L halves are already in xmm0/xmm2/xmm4
+
+.Yloop_2nd:
+       movdqa  xmm0, XMMWORD [wk(1)]   ; xmm0=(R-Y)H
+       movdqa  xmm2, XMMWORD [wk(2)]   ; xmm2=(G-Y)H
+       movdqa  xmm4, XMMWORD [wk(0)]   ; xmm4=(B-Y)H
+
+.Yloop_1st:
+       movdqa  xmm7, XMMWORD [rsi]     ; xmm7=Y(0123456789ABCDEF)
+
+       pcmpeqw xmm6,xmm6
+       psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+       pand    xmm6,xmm7               ; xmm6=Y(02468ACE)=YE
+       psrlw   xmm7,BYTE_BIT           ; xmm7=Y(13579BDF)=YO
+
+       movdqa  xmm1,xmm0               ; xmm1=xmm0=(R-Y)(L/H)
+       movdqa  xmm3,xmm2               ; xmm3=xmm2=(G-Y)(L/H)
+       movdqa  xmm5,xmm4               ; xmm5=xmm4=(B-Y)(L/H)
+
+       paddw     xmm0,xmm6             ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
+       paddw     xmm1,xmm7             ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
+       packuswb  xmm0,xmm0             ; xmm0=R(02468ACE********)
+       packuswb  xmm1,xmm1             ; xmm1=R(13579BDF********)
+
+       paddw     xmm2,xmm6             ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
+       paddw     xmm3,xmm7             ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
+       packuswb  xmm2,xmm2             ; xmm2=G(02468ACE********)
+       packuswb  xmm3,xmm3             ; xmm3=G(13579BDF********)
+
+       paddw     xmm4,xmm6             ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
+       paddw     xmm5,xmm7             ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
+       packuswb  xmm4,xmm4             ; xmm4=B(02468ACE********)
+       packuswb  xmm5,xmm5             ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+       ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+       ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+       ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+       ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+       punpcklbw xmmA,xmmC     ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+       punpcklbw xmmE,xmmB     ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+       punpcklbw xmmD,xmmF     ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+       movdqa    xmmG,xmmA
+       movdqa    xmmH,xmmA
+       punpcklwd xmmA,xmmE     ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+       punpckhwd xmmG,xmmE     ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+       psrldq    xmmH,2        ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+       psrldq    xmmE,2        ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+       movdqa    xmmC,xmmD
+       movdqa    xmmB,xmmD
+       punpcklwd xmmD,xmmH     ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+       punpckhwd xmmC,xmmH     ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+       psrldq    xmmB,2        ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+       movdqa    xmmF,xmmE
+       punpcklwd xmmE,xmmB     ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+       punpckhwd xmmF,xmmB     ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+       pshufd    xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+       movdqa    xmmB,xmmE
+       punpckldq xmmA,xmmD     ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+       punpckldq xmmE,xmmH     ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+       punpckhdq xmmD,xmmB     ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+       pshufd    xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+       movdqa    xmmB,xmmF
+       punpckldq xmmG,xmmC     ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+       punpckldq xmmF,xmmH     ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+       punpckhdq xmmC,xmmB     ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+       punpcklqdq xmmA,xmmE    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+       punpcklqdq xmmD,xmmG    ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+       punpcklqdq xmmF,xmmC    ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+       cmp     rcx, byte SIZEOF_XMMWORD
+       jb      short .column_st32      ; fewer than 16 pixels left: partial store
+
+       test    rdi, SIZEOF_XMMWORD-1
+       jnz     short .out1
+       ; --(aligned)-------------------
+       movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+       movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+       movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+       add     rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+       jmp     short .out0
+.out1: ; --(unaligned)-----------------
+       pcmpeqb    xmmH,xmmH                    ; xmmH=(all 1's)
+       maskmovdqu xmmA,xmmH                    ; movntdqu XMMWORD [rdi], xmmA
+       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       maskmovdqu xmmD,xmmH                    ; movntdqu XMMWORD [rdi], xmmD
+       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       maskmovdqu xmmF,xmmH                    ; movntdqu XMMWORD [rdi], xmmF
+       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+.out0:
+       sub     rcx, byte SIZEOF_XMMWORD
+       jz      near .endcolumn
+
+       add     rsi, byte SIZEOF_XMMWORD        ; inptr0
+       dec     al                      ; Yctr
+       jnz     near .Yloop_2nd
+
+       add     rbx, byte SIZEOF_XMMWORD        ; inptr1
+       add     rdx, byte SIZEOF_XMMWORD        ; inptr2
+       jmp     near .columnloop
+
+.column_st32:
+       pcmpeqb xmmH,xmmH                       ; xmmH=(all 1's)
+       lea     rcx, [rcx+rcx*2]                ; rcx *= RGB_PIXELSIZE (pixels -> bytes)
+       cmp     rcx, byte 2*SIZEOF_XMMWORD
+       jb      short .column_st16
+       maskmovdqu xmmA,xmmH                    ; movntdqu XMMWORD [rdi], xmmA
+       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       maskmovdqu xmmD,xmmH                    ; movntdqu XMMWORD [rdi], xmmD
+       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       movdqa  xmmA,xmmF
+       sub     rcx, byte 2*SIZEOF_XMMWORD
+       jmp     short .column_st15
+.column_st16:
+       cmp     rcx, byte SIZEOF_XMMWORD
+       jb      short .column_st15
+       maskmovdqu xmmA,xmmH                    ; movntdqu XMMWORD [rdi], xmmA
+       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       movdqa  xmmA,xmmD
+       sub     rcx, byte SIZEOF_XMMWORD
+.column_st15:
+       mov     rax,rcx                         ; rax = bytes still to store (< 16)
+       xor     rcx, byte 0x0F
+       shl     rcx, 2
+       movd    xmmB,ecx
+       psrlq   xmmH,4
+       pcmpeqb xmmE,xmmE
+       psrlq   xmmH,xmmB
+       psrlq   xmmE,xmmB
+       punpcklbw xmmE,xmmH                     ; xmmE = byte mask for the final maskmovdqu
+       ; ----------------
+       mov     rcx,rdi
+       and     rcx, byte SIZEOF_XMMWORD-1
+       jz      short .adj0
+       add     rax,rcx
+       cmp     rax, byte SIZEOF_XMMWORD
+       ja      short .adj0
+       and     rdi, byte (-SIZEOF_XMMWORD)     ; align to 16-byte boundary
+       shl     rcx, 3                  ; bit count for emulated pslldq xmmA,rcx & pslldq xmmE,rcx
+       movdqa  xmmG,xmmA
+       movdqa  xmmC,xmmE
+       pslldq  xmmA, SIZEOF_XMMWORD/2
+       pslldq  xmmE, SIZEOF_XMMWORD/2
+       movd    xmmD,ecx
+       sub     rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+       jb      short .adj1
+       movd    xmmF,ecx
+       psllq   xmmA,xmmF
+       psllq   xmmE,xmmF
+       jmp     short .adj0
+.adj1: neg     rcx
+       movd    xmmF,ecx
+       psrlq   xmmA,xmmF
+       psrlq   xmmE,xmmF
+       psllq   xmmG,xmmD
+       psllq   xmmC,xmmD
+       por     xmmA,xmmG
+       por     xmmE,xmmC
+.adj0: ; ----------------
+       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [rdi], xmmA
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+       pcmpeqb   xmm6,xmm6             ; xmm6=XE=X(02468ACE********)
+       pcmpeqb   xmm7,xmm7             ; xmm7=XO=X(13579BDF********)
+%else
+       pxor      xmm6,xmm6             ; xmm6=XE=X(02468ACE********)
+       pxor      xmm7,xmm7             ; xmm7=XO=X(13579BDF********)
+%endif
+       ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+       ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+       ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+       ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+       punpcklbw xmmA,xmmC     ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+       punpcklbw xmmE,xmmG     ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+       punpcklbw xmmB,xmmD     ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+       punpcklbw xmmF,xmmH     ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+       movdqa    xmmC,xmmA
+       punpcklwd xmmA,xmmE     ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+       punpckhwd xmmC,xmmE     ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+       movdqa    xmmG,xmmB
+       punpcklwd xmmB,xmmF     ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+       punpckhwd xmmG,xmmF     ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+       movdqa    xmmD,xmmA
+       punpckldq xmmA,xmmB     ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+       punpckhdq xmmD,xmmB     ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+       movdqa    xmmH,xmmC
+       punpckldq xmmC,xmmG     ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+       punpckhdq xmmH,xmmG     ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+       cmp     rcx, byte SIZEOF_XMMWORD
+       jb      short .column_st32      ; fewer than 16 pixels left: partial store
+
+       test    rdi, SIZEOF_XMMWORD-1
+       jnz     short .out1
+       ; --(aligned)-------------------
+       movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+       movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+       movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+       movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+       add     rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+       jmp     short .out0
+.out1: ; --(unaligned)-----------------
+       pcmpeqb    xmmE,xmmE                    ; xmmE=(all 1's)
+       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [rdi], xmmA
+       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       maskmovdqu xmmD,xmmE                    ; movntdqu XMMWORD [rdi], xmmD
+       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       maskmovdqu xmmC,xmmE                    ; movntdqu XMMWORD [rdi], xmmC
+       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       maskmovdqu xmmH,xmmE                    ; movntdqu XMMWORD [rdi], xmmH
+       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+.out0:
+       sub     rcx, byte SIZEOF_XMMWORD
+       jz      near .endcolumn
+
+       add     rsi, byte SIZEOF_XMMWORD        ; inptr0
+       dec     al                      ; Yctr
+       jnz     near .Yloop_2nd
+
+       add     rbx, byte SIZEOF_XMMWORD        ; inptr1
+       add     rdx, byte SIZEOF_XMMWORD        ; inptr2
+       jmp     near .columnloop
+
+.column_st32:
+       pcmpeqb xmmE,xmmE                       ; xmmE=(all 1's)
+       cmp     rcx, byte SIZEOF_XMMWORD/2
+       jb      short .column_st16
+       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [rdi], xmmA
+       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       maskmovdqu xmmD,xmmE                    ; movntdqu XMMWORD [rdi], xmmD
+       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       movdqa  xmmA,xmmC
+       movdqa  xmmD,xmmH
+       sub     rcx, byte SIZEOF_XMMWORD/2
+.column_st16:
+       cmp     rcx, byte SIZEOF_XMMWORD/4
+       jb      short .column_st15
+       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [rdi], xmmA
+       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       movdqa  xmmA,xmmD
+       sub     rcx, byte SIZEOF_XMMWORD/4
+.column_st15:
+       cmp     rcx, byte SIZEOF_XMMWORD/16
+       jb      near .endcolumn
+       mov     rax,rcx                         ; rax = pixels still to store (1..3)
+       xor     rcx, byte 0x03
+       inc     rcx
+       shl     rcx, 4
+       movd    xmmF,ecx
+       psrlq   xmmE,xmmF
+       punpcklbw xmmE,xmmE                     ; xmmE = byte mask for the final maskmovdqu
+       ; ----------------
+       mov     rcx,rdi
+       and     rcx, byte SIZEOF_XMMWORD-1
+       jz      short .adj0
+       lea     rax, [rcx+rax*4]        ; RGB_PIXELSIZE
+       cmp     rax, byte SIZEOF_XMMWORD
+       ja      short .adj0
+       and     rdi, byte (-SIZEOF_XMMWORD)     ; align to 16-byte boundary
+       shl     rcx, 3                  ; bit count for emulated pslldq xmmA,rcx & pslldq xmmE,rcx
+       movdqa  xmmB,xmmA
+       movdqa  xmmG,xmmE
+       pslldq  xmmA, SIZEOF_XMMWORD/2
+       pslldq  xmmE, SIZEOF_XMMWORD/2
+       movd    xmmC,ecx
+       sub     rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+       jb      short .adj1
+       movd    xmmH,ecx
+       psllq   xmmA,xmmH
+       psllq   xmmE,xmmH
+       jmp     short .adj0
+.adj1: neg     rcx
+       movd    xmmH,ecx
+       psrlq   xmmA,xmmH
+       psrlq   xmmE,xmmH
+       psllq   xmmB,xmmC
+       psllq   xmmG,xmmC
+       por     xmmA,xmmB
+       por     xmmE,xmmG
+.adj0: ; ----------------
+       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [rdi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+       sfence          ; flush the write buffer
+
+.return:
+       uncollect_args
+       pop     rbx
+       mov     rsp,rbp         ; rsp <- aligned rbp
+       pop     rsp             ; rsp <- unaligned rsp saved in the prologue
+       pop     rbp
+       ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_sse2 (JDIMENSION output_width,
+;                                  JSAMPIMAGE input_buf,
+;                                  JDIMENSION in_row_group_ctr,
+;                                  JSAMPARRAY output_buf);
+;
+
+; r10 = JDIMENSION output_width
+; r11 = JSAMPIMAGE input_buf
+; r12 = JDIMENSION in_row_group_ctr
+; r13 = JSAMPARRAY output_buf
+
+       align   16
+       global  EXTN(jsimd_h2v2_merged_upsample_sse2)
+
+EXTN(jsimd_h2v2_merged_upsample_sse2):
+       push    rbp
+       mov     rbp,rsp
+       push    rbx
+       collect_args
+       ; Runs the h2v1 routine twice: two luma rows share one chroma row.
+       mov     eax, r10d               ; output_width (JDIMENSION: low 32 bits only)
+
+       mov     rdi, r11
+       mov     ecx, r12d               ; in_row_group_ctr (JDIMENSION: low 32 bits only)
+       mov     rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+       mov     rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+       mov     rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+       mov     rdi, r13
+       lea     rsi, [rsi+rcx*SIZEOF_JSAMPROW]  ; callee indexes by rcx again => luma row 2*rcx
+       ; Build a 3-entry temporary input_buf on the stack, lowest address = entry 0.
+       push    rdx                     ; inptr2
+       push    rbx                     ; inptr1
+       push    rsi                     ; inptr00
+       mov     rbx,rsp                 ; rbx = address of the temporary input_buf
+       ; Save the values needed again for the second call.
+       push    rdi
+       push    rcx
+       push    rax
+       ; Marshal the four arguments for the h2v1 routine.
+       mov     rdx, rcx                ; in_row_group_ctr
+       mov     rcx, rdi                ; output_buf
+       mov     rdi, rax                ; output_width
+       mov     rsi, rbx                ; input_buf (temporary)
+
+       call    EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+       pop     rax
+       pop     rcx
+       pop     rdi
+       pop     rsi                     ; discard the temporary input_buf, reloading
+       pop     rbx                     ; the three pointers it held
+       pop     rdx
+
+       add     rdi, byte SIZEOF_JSAMPROW       ; outptr1
+       add     rsi, byte SIZEOF_JSAMPROW       ; inptr01
+       ; Second pass: next luma row and next output row, same chroma rows.
+       push    rdx                     ; inptr2
+       push    rbx                     ; inptr1
+       push    rsi                     ; inptr01
+       mov     rbx,rsp                 ; rbx = address of the temporary input_buf
+
+       push    rdi
+       push    rcx
+       push    rax
+
+       mov     rdx, rcx                ; in_row_group_ctr
+       mov     rcx, rdi                ; output_buf + 1 row
+       mov     rdi, rax                ; output_width
+       mov     rsi, rbx                ; input_buf (temporary)
+
+       call    EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+       pop     rax
+       pop     rcx
+       pop     rdi
+       pop     rsi
+       pop     rbx
+       pop     rdx
+
+       uncollect_args
+       pop     rbx
+       pop     rbp
+       ret
diff --git a/common/jpeg/simd/jdsamss2-64.asm b/common/jpeg/simd/jdsamss2-64.asm
new file mode 100644 (file)
index 0000000..1f7b1f5
--- /dev/null
@@ -0,0 +1,664 @@
+;
+; jdsamss2-64.asm - upsampling (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+       SECTION SEG_CONST
+
+       alignz  16
+       global  EXTN(jconst_fancy_upsample_sse2)
+
+EXTN(jconst_fancy_upsample_sse2):
+; Packed-word constants: each label is eight 16-bit copies of one value (16 bytes).
+PW_ONE         times 8 dw  1   ; rounding term added to the "previous sample" words in h2v1
+PW_TWO         times 8 dw  2   ; rounding term added to the "next sample" words in h2v1
+PW_THREE       times 8 dw  3   ; weight applied to the current sample (pmullw)
+PW_SEVEN       times 8 dw  7   ; not referenced by h2v1; presumably used by the h2v2 path - verify
+PW_EIGHT       times 8 dw  8   ; not referenced by h2v1; presumably used by the h2v2 path - verify
+
+       alignz  16
+
+; --------------------------------------------------------------------------
+       SECTION SEG_TEXT
+       BITS    64
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter".  This is a good compromise between
+; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
+;                                 JDIMENSION downsampled_width,
+;                                 JSAMPARRAY input_data,
+;                                 JSAMPARRAY * output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11 = JDIMENSION downsampled_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY * output_data_ptr
+
+       align   16
+       global  EXTN(jsimd_h2v1_fancy_upsample_sse2)
+
+EXTN(jsimd_h2v1_fancy_upsample_sse2):
+       push    rbp
+       mov     rbp,rsp
+       collect_args
+       ; Both counters are 32-bit C arguments; bits 63:32 of r10/r11 must not be trusted.
+       mov     eax, r11d       ; colctr = downsampled_width (zero-extended JDIMENSION)
+       test    rax,rax
+       jz      near .return
+
+       movsxd  rcx, r10d       ; rowctr = max_v_samp_factor (sign-extended int)
+       test    rcx,rcx
+       jz      near .return
+
+       mov     rsi, r12        ; input_data
+       mov     rdi, r13
+       mov     rdi, JSAMPARRAY [rdi]                   ; output_data
+.rowloop:
+       push    rax                     ; colctr
+       push    rdi
+       push    rsi
+
+       mov     rsi, JSAMPROW [rsi]     ; inptr
+       mov     rdi, JSAMPROW [rdi]     ; outptr
+
+       test    rax, SIZEOF_XMMWORD-1
+       jz      short .skip
+       mov     dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
+       mov     JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+.skip:
+       pxor    xmm0,xmm0               ; xmm0=(all 0's)
+       pcmpeqb xmm7,xmm7
+       psrldq  xmm7,(SIZEOF_XMMWORD-1) ; xmm7=byte mask selecting sample 0 only
+       pand    xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]    ; sample 0 doubles as its own left neighbour
+
+       add     rax, byte SIZEOF_XMMWORD-1
+       and     rax, byte -SIZEOF_XMMWORD       ; round colctr up to a whole XMM block
+       cmp     rax, byte SIZEOF_XMMWORD
+       ja      short .columnloop
+
+.columnloop_last:
+       pcmpeqb xmm6,xmm6
+       pslldq  xmm6,(SIZEOF_XMMWORD-1) ; xmm6=byte mask selecting sample 15 only
+       pand    xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]    ; sample 15 doubles as its own right neighbour
+       jmp     short .upsample
+
+.columnloop:
+       movdqa  xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+       pslldq  xmm6,(SIZEOF_XMMWORD-1) ; first sample of the next block moved to byte 15
+
+.upsample:
+       movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+       movdqa  xmm2,xmm1
+       movdqa  xmm3,xmm1               ; xmm1=( 0  1  2 ... 13 14 15)
+       pslldq  xmm2,1                  ; xmm2=(--  0  1 ... 12 13 14)
+       psrldq  xmm3,1                  ; xmm3=( 1  2  3 ... 14 15 --)
+
+       por     xmm2,xmm7               ; xmm2=(-1  0  1 ... 12 13 14)
+       por     xmm3,xmm6               ; xmm3=( 1  2  3 ... 14 15 16)
+
+       movdqa  xmm7,xmm1
+       psrldq  xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --)
+
+       movdqa    xmm4,xmm1
+       punpcklbw xmm1,xmm0             ; xmm1=( 0  1  2  3  4  5  6  7)
+       punpckhbw xmm4,xmm0             ; xmm4=( 8  9 10 11 12 13 14 15)
+       movdqa    xmm5,xmm2
+       punpcklbw xmm2,xmm0             ; xmm2=(-1  0  1  2  3  4  5  6)
+       punpckhbw xmm5,xmm0             ; xmm5=( 7  8  9 10 11 12 13 14)
+       movdqa    xmm6,xmm3
+       punpcklbw xmm3,xmm0             ; xmm3=( 1  2  3  4  5  6  7  8)
+       punpckhbw xmm6,xmm0             ; xmm6=( 9 10 11 12 13 14 15 16)
+
+       pmullw  xmm1,[rel PW_THREE]     ; 3 * current sample
+       pmullw  xmm4,[rel PW_THREE]
+       paddw   xmm2,[rel PW_ONE]       ; left neighbour + 1
+       paddw   xmm5,[rel PW_ONE]
+       paddw   xmm3,[rel PW_TWO]       ; right neighbour + 2
+       paddw   xmm6,[rel PW_TWO]
+
+       paddw   xmm2,xmm1
+       paddw   xmm5,xmm4
+       psrlw   xmm2,2                  ; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
+       psrlw   xmm5,2                  ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
+       paddw   xmm3,xmm1
+       paddw   xmm6,xmm4
+       psrlw   xmm3,2                  ; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
+       psrlw   xmm6,2                  ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
+
+       psllw   xmm3,BYTE_BIT           ; odd outputs go to the high byte of each word
+       psllw   xmm6,BYTE_BIT
+       por     xmm2,xmm3               ; xmm2=OutL=( 0  1  2 ... 13 14 15)
+       por     xmm5,xmm6               ; xmm5=OutH=(16 17 18 ... 29 30 31)
+
+       movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
+       movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5
+
+       sub     rax, byte SIZEOF_XMMWORD
+       add     rsi, byte 1*SIZEOF_XMMWORD      ; inptr
+       add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr
+       cmp     rax, byte SIZEOF_XMMWORD
+       ja      near .columnloop
+       test    eax,eax
+       jnz     near .columnloop_last
+
+       pop     rsi
+       pop     rdi
+       pop     rax
+
+       add     rsi, byte SIZEOF_JSAMPROW       ; input_data
+       add     rdi, byte SIZEOF_JSAMPROW       ; output_data
+       dec     rcx                             ; rowctr
+       jg      near .rowloop
+
+.return:
+       uncollect_args
+       pop     rbp
+       ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
+;                                 JDIMENSION downsampled_width,
+;                                 JSAMPARRAY input_data,
+;                                 JSAMPARRAY * output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11 = JDIMENSION downsampled_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY * output_data_ptr
+
+%define wk(i)          rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM         4
+
+       align   16
+       global  EXTN(jsimd_h2v2_fancy_upsample_sse2)
+
+EXTN(jsimd_h2v2_fancy_upsample_sse2):
+       push    rbp
+       mov     rax,rsp                         ; rax = original rbp
+       sub     rsp, byte 4
+       and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
+       mov     [rsp],rax
+       mov     rbp,rsp                         ; rbp = aligned rbp
+       lea     rsp, [wk(0)]
+       push    rbx
+       collect_args
+
+       mov     rax, r11  ; colctr
+       test    rax,rax
+       jz      near .return
+
+       mov     rcx, r10        ; rowctr
+       test    rcx,rcx
+       jz      near .return
+
+       mov     rsi, r12        ; input_data
+       mov     rdi, r13
+       mov     rdi, JSAMPARRAY [rdi]                   ; output_data
+.rowloop:
+       push    rax                                     ; colctr
+       push    rcx
+       push    rdi
+       push    rsi
+
+       mov     rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]   ; inptr1(above)
+       mov     rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; inptr0
+       mov     rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; inptr1(below)
+       mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]   ; outptr0
+       mov     rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]   ; outptr1
+
+       test    rax, SIZEOF_XMMWORD-1
+       jz      short .skip
+       push    rdx
+       mov     dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
+       mov     JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
+       mov     dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
+       mov     JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
+       mov     dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
+       mov     JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+       pop     rdx
+.skip:
+       ; -- process the first column block
+
+       movdqa  xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD]    ; xmm0=row[ 0][0]
+       movdqa  xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD]    ; xmm1=row[-1][0]
+       movdqa  xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD]    ; xmm2=row[+1][0]
+
+       pxor      xmm3,xmm3             ; xmm3=(all 0's)
+       movdqa    xmm4,xmm0
+       punpcklbw xmm0,xmm3             ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
+       punpckhbw xmm4,xmm3             ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
+       movdqa    xmm5,xmm1
+       punpcklbw xmm1,xmm3             ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
+       punpckhbw xmm5,xmm3             ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
+       movdqa    xmm6,xmm2
+       punpcklbw xmm2,xmm3             ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
+       punpckhbw xmm6,xmm3             ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
+
+       pmullw  xmm0,[PW_THREE]
+       pmullw  xmm4,[PW_THREE]
+
+       pcmpeqb xmm7,xmm7
+       psrldq  xmm7,(SIZEOF_XMMWORD-2)
+
+       paddw   xmm1,xmm0               ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
+       paddw   xmm5,xmm4               ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
+       paddw   xmm2,xmm0               ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
+       paddw   xmm6,xmm4               ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
+
+       movdqa  XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1    ; temporarily save
+       movdqa  XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5    ; the intermediate data
+       movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
+       movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6
+
+       pand    xmm1,xmm7               ; xmm1=( 0 -- -- -- -- -- -- --)
+       pand    xmm2,xmm7               ; xmm2=( 0 -- -- -- -- -- -- --)
+
+       movdqa  XMMWORD [wk(0)], xmm1
+       movdqa  XMMWORD [wk(1)], xmm2
+
+       add     rax, byte SIZEOF_XMMWORD-1
+       and     rax, byte -SIZEOF_XMMWORD
+       cmp     rax, byte SIZEOF_XMMWORD
+       ja      short .columnloop
+
+.columnloop_last:
+       ; -- process the last column block
+       ;
+       ; There is no following input block, so the "next sample" word kept in
+       ; wk(2)/wk(3) is taken from word 15 of the current block's own
+       ; intermediate data, which was saved at outptr0/outptr1 (rdx/rdi)
+       ; + 1*SIZEOF_XMMWORD.
+
+       pcmpeqb xmm1,xmm1               ; xmm1=(all 1's)
+       pslldq  xmm1,(SIZEOF_XMMWORD-2) ; mask with only the top word set
+       movdqa  xmm2,xmm1
+
+       ; Address through the full 64-bit rdx/rdi: the 32-bit edx/edi forms
+       ; would truncate the output row pointers in 64-bit mode.
+       pand    xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]
+       pand    xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]
+
+       movdqa  XMMWORD [wk(2)], xmm1   ; xmm1=(-- -- -- -- -- -- -- 15)
+       movdqa  XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)
+
+       jmp     near .upsample
+
+.columnloop:
+       ; -- process the next column block
+
+       movdqa  xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD]    ; xmm0=row[ 0][1]
+       movdqa  xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD]    ; xmm1=row[-1][1]
+       movdqa  xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]    ; xmm2=row[+1][1]
+
+       pxor      xmm3,xmm3             ; xmm3=(all 0's)
+       movdqa    xmm4,xmm0
+       punpcklbw xmm0,xmm3             ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
+       punpckhbw xmm4,xmm3             ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
+       movdqa    xmm5,xmm1
+       punpcklbw xmm1,xmm3             ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
+       punpckhbw xmm5,xmm3             ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
+       movdqa    xmm6,xmm2
+       punpcklbw xmm2,xmm3             ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
+       punpckhbw xmm6,xmm3             ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
+
+       pmullw  xmm0,[PW_THREE]
+       pmullw  xmm4,[PW_THREE]
+
+       paddw   xmm1,xmm0               ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
+       paddw   xmm5,xmm4               ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
+       paddw   xmm2,xmm0               ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
+       paddw   xmm6,xmm4               ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
+
+       movdqa  XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1    ; temporarily save
+       movdqa  XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5    ; the intermediate data
+       movdqa  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
+       movdqa  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6
+
+       pslldq  xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- --  0)
+       pslldq  xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- --  0)
+
+       movdqa  XMMWORD [wk(2)], xmm1
+       movdqa  XMMWORD [wk(3)], xmm2
+
+.upsample:
+       ; -- process the upper row
+
+       movdqa  xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
+       movdqa  xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]
+
+       movdqa  xmm0,xmm7               ; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
+       movdqa  xmm4,xmm3               ; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
+       psrldq  xmm0,2                  ; xmm0=( 1  2  3  4  5  6  7 --)
+       pslldq  xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- --  8)
+       movdqa  xmm5,xmm7
+       movdqa  xmm6,xmm3
+       psrldq  xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
+       pslldq  xmm6,2                  ; xmm6=(--  8  9 10 11 12 13 14)
+
+       por     xmm0,xmm4               ; xmm0=( 1  2  3  4  5  6  7  8)
+       por     xmm5,xmm6               ; xmm5=( 7  8  9 10 11 12 13 14)
+
+       movdqa  xmm1,xmm7
+       movdqa  xmm2,xmm3
+       pslldq  xmm1,2                  ; xmm1=(--  0  1  2  3  4  5  6)
+       psrldq  xmm2,2                  ; xmm2=( 9 10 11 12 13 14 15 --)
+       movdqa  xmm4,xmm3
+       psrldq  xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)
+
+       por     xmm1, XMMWORD [wk(0)]   ; xmm1=(-1  0  1  2  3  4  5  6)
+       por     xmm2, XMMWORD [wk(2)]   ; xmm2=( 9 10 11 12 13 14 15 16)
+
+       movdqa  XMMWORD [wk(0)], xmm4
+
+       pmullw  xmm7,[PW_THREE]
+       pmullw  xmm3,[PW_THREE]
+       paddw   xmm1,[PW_EIGHT]
+       paddw   xmm5,[PW_EIGHT]
+       paddw   xmm0,[PW_SEVEN]
+       paddw   xmm2,[PW_SEVEN]
+
+       paddw   xmm1,xmm7
+       paddw   xmm5,xmm3
+       psrlw   xmm1,4                  ; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
+       psrlw   xmm5,4                  ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
+       paddw   xmm0,xmm7
+       paddw   xmm2,xmm3
+       psrlw   xmm0,4                  ; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
+       psrlw   xmm2,4                  ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
+
+       psllw   xmm0,BYTE_BIT
+       psllw   xmm2,BYTE_BIT
+       por     xmm1,xmm0               ; xmm1=Out0L=( 0  1  2 ... 13 14 15)
+       por     xmm5,xmm2               ; xmm5=Out0H=(16 17 18 ... 29 30 31)
+
+       movdqa  XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1
+       movdqa  XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5
+
+       ; -- process the lower row
+
+       movdqa  xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
+       movdqa  xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]
+
+       movdqa  xmm7,xmm6               ; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
+       movdqa  xmm3,xmm4               ; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
+       psrldq  xmm7,2                  ; xmm7=( 1  2  3  4  5  6  7 --)
+       pslldq  xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- --  8)
+       movdqa  xmm0,xmm6
+       movdqa  xmm2,xmm4
+       psrldq  xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
+       pslldq  xmm2,2                  ; xmm2=(--  8  9 10 11 12 13 14)
+
+       por     xmm7,xmm3               ; xmm7=( 1  2  3  4  5  6  7  8)
+       por     xmm0,xmm2               ; xmm0=( 7  8  9 10 11 12 13 14)
+
+       movdqa  xmm1,xmm6
+       movdqa  xmm5,xmm4
+       pslldq  xmm1,2                  ; xmm1=(--  0  1  2  3  4  5  6)
+       psrldq  xmm5,2                  ; xmm5=( 9 10 11 12 13 14 15 --)
+       movdqa  xmm3,xmm4
+       psrldq  xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)
+
+       por     xmm1, XMMWORD [wk(1)]   ; xmm1=(-1  0  1  2  3  4  5  6)
+       por     xmm5, XMMWORD [wk(3)]   ; xmm5=( 9 10 11 12 13 14 15 16)
+
+       movdqa  XMMWORD [wk(1)], xmm3
+
+       pmullw  xmm6,[PW_THREE]
+       pmullw  xmm4,[PW_THREE]
+       paddw   xmm1,[PW_EIGHT]
+       paddw   xmm0,[PW_EIGHT]
+       paddw   xmm7,[PW_SEVEN]
+       paddw   xmm5,[PW_SEVEN]
+
+       paddw   xmm1,xmm6
+       paddw   xmm0,xmm4
+       psrlw   xmm1,4                  ; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
+       psrlw   xmm0,4                  ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
+       paddw   xmm7,xmm6
+       paddw   xmm5,xmm4
+       psrlw   xmm7,4                  ; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
+       psrlw   xmm5,4                  ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
+
+       psllw   xmm7,BYTE_BIT
+       psllw   xmm5,BYTE_BIT
+       por     xmm1,xmm7               ; xmm1=Out1L=( 0  1  2 ... 13 14 15)
+       por     xmm0,xmm5               ; xmm0=Out1H=(16 17 18 ... 29 30 31)
+
+       movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1
+       movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0
+
+       sub     rax, byte SIZEOF_XMMWORD
+       add     rcx, byte 1*SIZEOF_XMMWORD      ; inptr1(above)
+       add     rbx, byte 1*SIZEOF_XMMWORD      ; inptr0
+       add     rsi, byte 1*SIZEOF_XMMWORD      ; inptr1(below)
+       add     rdx, byte 2*SIZEOF_XMMWORD      ; outptr0
+       add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr1
+       cmp     rax, byte SIZEOF_XMMWORD
+       ja      near .columnloop
+       test    rax,rax
+       jnz     near .columnloop_last
+
+       pop     rsi
+       pop     rdi
+       pop     rcx
+       pop     rax
+
+       add     rsi, byte 1*SIZEOF_JSAMPROW     ; input_data
+       add     rdi, byte 2*SIZEOF_JSAMPROW     ; output_data
+       sub     rcx, byte 2                     ; rowctr
+       jg      near .rowloop
+
+.return:
+       uncollect_args
+       pop     rbx
+       mov     rsp,rbp         ; rsp <- aligned rbp
+       pop     rsp             ; rsp <- original rbp
+       pop     rbp
+       ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
+;                           JDIMENSION output_width,
+;                           JSAMPARRAY input_data,
+;                           JSAMPARRAY * output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11 = JDIMENSION output_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY * output_data_ptr
+
+       align   16
+       global  EXTN(jsimd_h2v1_upsample_sse2)
+
+EXTN(jsimd_h2v1_upsample_sse2):
+       ; Doubles every input sample horizontally; one output row is produced
+       ; per input row.
+       ;
+       ; Register usage:
+       ;   rdx = output_width rounded up to a multiple of 2*SIZEOF_XMMWORD
+       ;   rcx = rowctr (rows still to produce)
+       ;   rsi = input_data cursor  (inptr inside the column loop)
+       ;   rdi = output_data cursor (outptr inside the column loop)
+       ;   rax = colctr (output samples still to produce in the current row)
+       ;
+       ; Every row access is a movdqa, so the rows must be 16-byte aligned,
+       ; and a whole rounded-up width is stored to each output row.
+       push    rbp
+       mov     rbp,rsp
+       collect_args
+
+       ; output_width is a 32-bit JDIMENSION: the upper half of r11 is not
+       ; guaranteed to be zero, so zero-extend the low 32 bits into rdx.
+       mov     edx, r11d
+       add     rdx, byte (2*SIZEOF_XMMWORD)-1
+       and     rdx, byte -(2*SIZEOF_XMMWORD)
+       jz      near .return
+
+       ; max_v_samp_factor is a 32-bit int: sign-extend it for the same
+       ; reason before it is used as a 64-bit counter.
+       movsxd  rcx, r10d       ; rowctr
+       test    rcx,rcx
+       jz      short .return
+
+       mov     rsi, r12 ; input_data
+       mov     rdi, r13
+       mov     rdi, JSAMPARRAY [rdi]                   ; output_data
+.rowloop:
+       push    rdi                             ; keep the row-pointer cursors;
+       push    rsi                             ; rsi/rdi are reused per sample row
+
+       mov     rsi, JSAMPROW [rsi]             ; inptr
+       mov     rdi, JSAMPROW [rdi]             ; outptr
+       mov     rax,rdx                         ; colctr
+.columnloop:
+       ; Unrolled twice: each half turns 16 input samples into 32 output
+       ; samples by interleaving a register with itself.
+
+       movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+
+       movdqa    xmm1,xmm0
+       punpcklbw xmm0,xmm0             ; samples 0-7, each duplicated
+       punpckhbw xmm1,xmm1             ; samples 8-15, each duplicated
+
+       movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+       movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
+
+       sub     rax, byte 2*SIZEOF_XMMWORD
+       jz      short .nextrow
+
+       movdqa  xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+       movdqa    xmm3,xmm2
+       punpcklbw xmm2,xmm2
+       punpckhbw xmm3,xmm3
+
+       movdqa  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
+       movdqa  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
+
+       sub     rax, byte 2*SIZEOF_XMMWORD
+       jz      short .nextrow
+
+       add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
+       add     rdi, byte 4*SIZEOF_XMMWORD      ; outptr
+       jmp     short .columnloop
+
+.nextrow:
+       pop     rsi
+       pop     rdi
+
+       add     rsi, byte SIZEOF_JSAMPROW       ; input_data
+       add     rdi, byte SIZEOF_JSAMPROW       ; output_data
+       dec     rcx                             ; rowctr
+       jg      short .rowloop
+
+.return:
+       uncollect_args
+       pop     rbp
+       ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor,
+;                           JDIMENSION output_width,
+;                           JSAMPARRAY input_data,
+;                           JSAMPARRAY * output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11 = JDIMENSION output_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY * output_data_ptr
+
+       align   16
+       global  EXTN(jsimd_h2v2_upsample_sse2)
+
+EXTN(jsimd_h2v2_upsample_sse2):
+       ; Doubles every input sample horizontally and writes the result to two
+       ; consecutive output rows, so each input row yields two output rows.
+       ;
+       ; Register usage:
+       ;   rdx = output_width rounded up to a multiple of 2*SIZEOF_XMMWORD
+       ;   rcx = rowctr (output rows still to produce; drops by 2 per pass)
+       ;   rsi = input_data cursor  (inptr inside the column loop)
+       ;   rdi = output_data cursor (outptr1 inside the column loop)
+       ;   rbx = outptr0 inside the column loop (callee-saved, hence pushed)
+       ;   rax = colctr (output samples still to produce in the current row)
+       ;
+       ; Every row access is a movdqa, so the rows must be 16-byte aligned,
+       ; and a whole rounded-up width is stored to each output row.
+       push    rbp
+       mov     rbp,rsp
+       push    rbx
+       collect_args
+
+       ; output_width is a 32-bit JDIMENSION: the upper half of r11 is not
+       ; guaranteed to be zero, so zero-extend the low 32 bits into rdx.
+       mov     edx, r11d
+       add     rdx, byte (2*SIZEOF_XMMWORD)-1
+       and     rdx, byte -(2*SIZEOF_XMMWORD)
+       jz      near .return
+
+       ; max_v_samp_factor is a 32-bit int: sign-extend it for the same
+       ; reason before it is used as a 64-bit counter.
+       movsxd  rcx, r10d       ; rowctr
+       test    rcx,rcx
+       jz      near .return
+
+       mov     rsi, r12        ; input_data
+       mov     rdi, r13
+       mov     rdi, JSAMPARRAY [rdi]                   ; output_data
+.rowloop:
+       push    rdi                             ; keep the row-pointer cursors;
+       push    rsi                             ; rsi/rdi are reused per sample row
+
+       mov     rsi, JSAMPROW [rsi]                     ; inptr
+       mov     rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]   ; outptr0
+       mov     rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]   ; outptr1
+       mov     rax,rdx                                 ; colctr
+.columnloop:
+       ; Unrolled twice: each half turns 16 input samples into 32 output
+       ; samples and stores the same data to both output rows.
+
+       movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+
+       movdqa    xmm1,xmm0
+       punpcklbw xmm0,xmm0             ; samples 0-7, each duplicated
+       punpckhbw xmm1,xmm1             ; samples 8-15, each duplicated
+
+       movdqa  XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
+       movdqa  XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
+       movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+       movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
+
+       sub     rax, byte 2*SIZEOF_XMMWORD
+       jz      short .nextrow
+
+       movdqa  xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+       movdqa    xmm3,xmm2
+       punpcklbw xmm2,xmm2
+       punpckhbw xmm3,xmm3
+
+       movdqa  XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
+       movdqa  XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
+       movdqa  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
+       movdqa  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
+
+       sub     rax, byte 2*SIZEOF_XMMWORD
+       jz      short .nextrow
+
+       add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
+       add     rbx, byte 4*SIZEOF_XMMWORD      ; outptr0
+       add     rdi, byte 4*SIZEOF_XMMWORD      ; outptr1
+       jmp     short .columnloop
+
+.nextrow:
+       pop     rsi
+       pop     rdi
+
+       add     rsi, byte 1*SIZEOF_JSAMPROW     ; input_data
+       add     rdi, byte 2*SIZEOF_JSAMPROW     ; output_data
+       sub     rcx, byte 2                     ; rowctr
+       jg      near .rowloop
+
+.return:
+       uncollect_args
+       pop     rbx
+       pop     rbp
+       ret
diff --git a/common/jpeg/simd/jfss2fst-64.asm b/common/jpeg/simd/jfss2fst-64.asm
new file mode 100644 (file)
index 0000000..9303156
--- /dev/null
@@ -0,0 +1,388 @@
+;
+; jfss2fst.asm - fast integer FDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
+; for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+; Fixed-point precision of the multiplier constants below:
+; FIX(x) is x scaled by 2^CONST_BITS and rounded to an integer.
+%define CONST_BITS     8       ; 14 is also OK.
+
+%if CONST_BITS == 8
+; Precomputed values for the default precision.
+F_0_382        equ      98             ; FIX(0.382683433)
+F_0_541        equ     139             ; FIX(0.541196100)
+F_0_707        equ     181             ; FIX(0.707106781)
+F_1_306        equ     334             ; FIX(1.306562965)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+; The integer literals below are the constants scaled by 2^30; DESCALE
+; rounds them down to CONST_BITS fractional bits.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_382        equ     DESCALE( 410903207,30-CONST_BITS)       ; FIX(0.382683433)
+F_0_541        equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
+F_0_707        equ     DESCALE( 759250124,30-CONST_BITS)       ; FIX(0.707106781)
+F_1_306        equ     DESCALE(1402911301,30-CONST_BITS)       ; FIX(1.306562965)
+%endif
+
+; --------------------------------------------------------------------------
+       SECTION SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+;
+; pmulhw keeps the high 16 bits of each 16x16-bit product, so a value that
+; was pre-shifted left by PRE_MULTIPLY_SCALE_BITS, multiplied by a constant
+; shifted left by CONST_SHIFT, comes out as (value * FIX(c)) >> CONST_BITS.
+
+%define PRE_MULTIPLY_SCALE_BITS   2
+%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+       alignz  16
+       global  EXTN(jconst_fdct_ifast_sse2)
+
+EXTN(jconst_fdct_ifast_sse2):
+
+; Each constant is replicated into all 8 words of an xmmword so that it can
+; be used directly as a pmulhw memory operand.
+PW_F0707       times 8 dw  F_0_707 << CONST_SHIFT
+PW_F0382       times 8 dw  F_0_382 << CONST_SHIFT
+PW_F0541       times 8 dw  F_0_541 << CONST_SHIFT
+PW_F1306       times 8 dw  F_1_306 << CONST_SHIFT
+
+       alignz  16
+
+; --------------------------------------------------------------------------
+       SECTION SEG_TEXT
+       BITS    64
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_ifast_sse2 (DCTELEM * data)
+;
+
+; r10 = DCTELEM * data
+
+; wk(i) addresses the i-th 16-byte scratch slot, located just below the
+; aligned frame pointer set up in the prologue.
+%define wk(i)          rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM         2
+
+       align   16
+       global  EXTN(jsimd_fdct_ifast_sse2)
+
+; In-place forward DCT of the 8x8 block of DCTELEMs at 'data'.
+; Pass 1 transposes the block and transforms it; pass 2 transposes back and
+; transforms again, then stores the results over the input.
+; The block is accessed with movdqa, so 'data' must be 16-byte aligned.
+; wk(0)/wk(1) are used as spill slots because all eight xmm registers used
+; here are live during the transposes.
+EXTN(jsimd_fdct_ifast_sse2):
+       ; Prologue: build a 16-byte-aligned frame.  rbp ends up pointing at an
+       ; aligned stack slot that holds the pre-alignment rsp, and the wk[]
+       ; scratch area lies directly below that slot.
+       push    rbp
+       mov     rax,rsp                         ; rax = rsp after the push (address of saved rbp)
+       sub     rsp, byte 4                     ; NOTE(review): 4 looks inherited from 32-bit code;
+                                               ; the qword store below relies on rsp being 8-byte
+                                               ; aligned here so the 'and' frees >= 8 bytes - confirm
+       and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
+       mov     [rsp],rax                       ; remember the unaligned rsp for the epilogue
+       mov     rbp,rsp                         ; rbp = aligned rbp
+       lea     rsp, [wk(0)]                    ; reserve WK_NUM xmmwords of scratch
+       collect_args
+
+       ; ---- Pass 1: process rows.
+
+       mov     rdx, r10        ; (DCTELEM *)
+
+       movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
+       movdqa  xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
+       movdqa  xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
+       movdqa  xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
+
+       ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+       ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+       movdqa    xmm4,xmm0             ; transpose coefficients(phase 1)
+       punpcklwd xmm0,xmm1             ; xmm0=(00 10 01 11 02 12 03 13)
+       punpckhwd xmm4,xmm1             ; xmm4=(04 14 05 15 06 16 07 17)
+       movdqa    xmm5,xmm2             ; transpose coefficients(phase 1)
+       punpcklwd xmm2,xmm3             ; xmm2=(20 30 21 31 22 32 23 33)
+       punpckhwd xmm5,xmm3             ; xmm5=(24 34 25 35 26 36 27 37)
+
+       movdqa  xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
+       movdqa  xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
+       movdqa  xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
+       movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
+
+       ; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
+       ; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
+
+       movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=(20 30 21 31 22 32 23 33)
+       movdqa  XMMWORD [wk(1)], xmm5   ; wk(1)=(24 34 25 35 26 36 27 37)
+
+       movdqa    xmm2,xmm6             ; transpose coefficients(phase 1)
+       punpcklwd xmm6,xmm7             ; xmm6=(40 50 41 51 42 52 43 53)
+       punpckhwd xmm2,xmm7             ; xmm2=(44 54 45 55 46 56 47 57)
+       movdqa    xmm5,xmm1             ; transpose coefficients(phase 1)
+       punpcklwd xmm1,xmm3             ; xmm1=(60 70 61 71 62 72 63 73)
+       punpckhwd xmm5,xmm3             ; xmm5=(64 74 65 75 66 76 67 77)
+
+       movdqa    xmm7,xmm6             ; transpose coefficients(phase 2)
+       punpckldq xmm6,xmm1             ; xmm6=(40 50 60 70 41 51 61 71)
+       punpckhdq xmm7,xmm1             ; xmm7=(42 52 62 72 43 53 63 73)
+       movdqa    xmm3,xmm2             ; transpose coefficients(phase 2)
+       punpckldq xmm2,xmm5             ; xmm2=(44 54 64 74 45 55 65 75)
+       punpckhdq xmm3,xmm5             ; xmm3=(46 56 66 76 47 57 67 77)
+
+       movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=(20 30 21 31 22 32 23 33)
+       movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=(24 34 25 35 26 36 27 37)
+       movdqa  XMMWORD [wk(0)], xmm7   ; wk(0)=(42 52 62 72 43 53 63 73)
+       movdqa  XMMWORD [wk(1)], xmm2   ; wk(1)=(44 54 64 74 45 55 65 75)
+
+       movdqa    xmm7,xmm0             ; transpose coefficients(phase 2)
+       punpckldq xmm0,xmm1             ; xmm0=(00 10 20 30 01 11 21 31)
+       punpckhdq xmm7,xmm1             ; xmm7=(02 12 22 32 03 13 23 33)
+       movdqa    xmm2,xmm4             ; transpose coefficients(phase 2)
+       punpckldq xmm4,xmm5             ; xmm4=(04 14 24 34 05 15 25 35)
+       punpckhdq xmm2,xmm5             ; xmm2=(06 16 26 36 07 17 27 37)
+
+       movdqa     xmm1,xmm0            ; transpose coefficients(phase 3)
+       punpcklqdq xmm0,xmm6            ; xmm0=(00 10 20 30 40 50 60 70)=data0
+       punpckhqdq xmm1,xmm6            ; xmm1=(01 11 21 31 41 51 61 71)=data1
+       movdqa     xmm5,xmm2            ; transpose coefficients(phase 3)
+       punpcklqdq xmm2,xmm3            ; xmm2=(06 16 26 36 46 56 66 76)=data6
+       punpckhqdq xmm5,xmm3            ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+       movdqa  xmm6,xmm1
+       movdqa  xmm3,xmm0
+       psubw   xmm1,xmm2               ; xmm1=data1-data6=tmp6
+       psubw   xmm0,xmm5               ; xmm0=data0-data7=tmp7
+       paddw   xmm6,xmm2               ; xmm6=data1+data6=tmp1
+       paddw   xmm3,xmm5               ; xmm3=data0+data7=tmp0
+
+       movdqa  xmm2, XMMWORD [wk(0)]   ; xmm2=(42 52 62 72 43 53 63 73)
+       movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=(44 54 64 74 45 55 65 75)
+       movdqa  XMMWORD [wk(0)], xmm1   ; wk(0)=tmp6
+       movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=tmp7
+
+       movdqa     xmm1,xmm7            ; transpose coefficients(phase 3)
+       punpcklqdq xmm7,xmm2            ; xmm7=(02 12 22 32 42 52 62 72)=data2
+       punpckhqdq xmm1,xmm2            ; xmm1=(03 13 23 33 43 53 63 73)=data3
+       movdqa     xmm0,xmm4            ; transpose coefficients(phase 3)
+       punpcklqdq xmm4,xmm5            ; xmm4=(04 14 24 34 44 54 64 74)=data4
+       punpckhqdq xmm0,xmm5            ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+       movdqa  xmm2,xmm1
+       movdqa  xmm5,xmm7
+       paddw   xmm1,xmm4               ; xmm1=data3+data4=tmp3
+       paddw   xmm7,xmm0               ; xmm7=data2+data5=tmp2
+       psubw   xmm2,xmm4               ; xmm2=data3-data4=tmp4
+       psubw   xmm5,xmm0               ; xmm5=data2-data5=tmp5
+
+       ; -- Even part
+
+       movdqa  xmm4,xmm3
+       movdqa  xmm0,xmm6
+       psubw   xmm3,xmm1               ; xmm3=tmp13
+       psubw   xmm6,xmm7               ; xmm6=tmp12
+       paddw   xmm4,xmm1               ; xmm4=tmp10
+       paddw   xmm0,xmm7               ; xmm0=tmp11
+
+       paddw   xmm6,xmm3
+       psllw   xmm6,PRE_MULTIPLY_SCALE_BITS
+       pmulhw  xmm6,[PW_F0707] ; xmm6=z1
+
+       movdqa  xmm1,xmm4
+       movdqa  xmm7,xmm3
+       psubw   xmm4,xmm0               ; xmm4=data4
+       psubw   xmm3,xmm6               ; xmm3=data6
+       paddw   xmm1,xmm0               ; xmm1=data0
+       paddw   xmm7,xmm6               ; xmm7=data2
+
+       movdqa  xmm0, XMMWORD [wk(0)]   ; xmm0=tmp6
+       movdqa  xmm6, XMMWORD [wk(1)]   ; xmm6=tmp7
+       movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=data4
+       movdqa  XMMWORD [wk(1)], xmm3   ; wk(1)=data6
+
+       ; -- Odd part
+
+       paddw   xmm2,xmm5               ; xmm2=tmp10
+       paddw   xmm5,xmm0               ; xmm5=tmp11
+       paddw   xmm0,xmm6               ; xmm0=tmp12, xmm6=tmp7
+
+       psllw   xmm2,PRE_MULTIPLY_SCALE_BITS
+       psllw   xmm0,PRE_MULTIPLY_SCALE_BITS
+
+       psllw   xmm5,PRE_MULTIPLY_SCALE_BITS
+       pmulhw  xmm5,[PW_F0707] ; xmm5=z3
+
+       movdqa  xmm4,xmm2               ; xmm4=tmp10
+       psubw   xmm2,xmm0
+       pmulhw  xmm2,[PW_F0382] ; xmm2=z5
+       pmulhw  xmm4,[PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+       pmulhw  xmm0,[PW_F1306] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
+       paddw   xmm4,xmm2               ; xmm4=z2
+       paddw   xmm0,xmm2               ; xmm0=z4
+
+       movdqa  xmm3,xmm6
+       psubw   xmm6,xmm5               ; xmm6=z13
+       paddw   xmm3,xmm5               ; xmm3=z11
+
+       movdqa  xmm2,xmm6
+       movdqa  xmm5,xmm3
+       psubw   xmm6,xmm4               ; xmm6=data3
+       psubw   xmm3,xmm0               ; xmm3=data7
+       paddw   xmm2,xmm4               ; xmm2=data5
+       paddw   xmm5,xmm0               ; xmm5=data1
+
+       ; ---- Pass 2: process columns.
+
+       ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
+       ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
+
+       movdqa    xmm4,xmm1             ; transpose coefficients(phase 1)
+       punpcklwd xmm1,xmm5             ; xmm1=(00 01 10 11 20 21 30 31)
+       punpckhwd xmm4,xmm5             ; xmm4=(40 41 50 51 60 61 70 71)
+       movdqa    xmm0,xmm7             ; transpose coefficients(phase 1)
+       punpcklwd xmm7,xmm6             ; xmm7=(02 03 12 13 22 23 32 33)
+       punpckhwd xmm0,xmm6             ; xmm0=(42 43 52 53 62 63 72 73)
+
+       movdqa  xmm5, XMMWORD [wk(0)]   ; xmm5=col4
+       movdqa  xmm6, XMMWORD [wk(1)]   ; xmm6=col6
+
+       ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
+       ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
+
+       movdqa  XMMWORD [wk(0)], xmm7   ; wk(0)=(02 03 12 13 22 23 32 33)
+       movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=(42 43 52 53 62 63 72 73)
+
+       movdqa    xmm7,xmm5             ; transpose coefficients(phase 1)
+       punpcklwd xmm5,xmm2             ; xmm5=(04 05 14 15 24 25 34 35)
+       punpckhwd xmm7,xmm2             ; xmm7=(44 45 54 55 64 65 74 75)
+       movdqa    xmm0,xmm6             ; transpose coefficients(phase 1)
+       punpcklwd xmm6,xmm3             ; xmm6=(06 07 16 17 26 27 36 37)
+       punpckhwd xmm0,xmm3             ; xmm0=(46 47 56 57 66 67 76 77)
+
+       movdqa    xmm2,xmm5             ; transpose coefficients(phase 2)
+       punpckldq xmm5,xmm6             ; xmm5=(04 05 06 07 14 15 16 17)
+       punpckhdq xmm2,xmm6             ; xmm2=(24 25 26 27 34 35 36 37)
+       movdqa    xmm3,xmm7             ; transpose coefficients(phase 2)
+       punpckldq xmm7,xmm0             ; xmm7=(44 45 46 47 54 55 56 57)
+       punpckhdq xmm3,xmm0             ; xmm3=(64 65 66 67 74 75 76 77)
+
+       movdqa  xmm6, XMMWORD [wk(0)]   ; xmm6=(02 03 12 13 22 23 32 33)
+       movdqa  xmm0, XMMWORD [wk(1)]   ; xmm0=(42 43 52 53 62 63 72 73)
+       movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=(24 25 26 27 34 35 36 37)
+       movdqa  XMMWORD [wk(1)], xmm7   ; wk(1)=(44 45 46 47 54 55 56 57)
+
+       movdqa    xmm2,xmm1             ; transpose coefficients(phase 2)
+       punpckldq xmm1,xmm6             ; xmm1=(00 01 02 03 10 11 12 13)
+       punpckhdq xmm2,xmm6             ; xmm2=(20 21 22 23 30 31 32 33)
+       movdqa    xmm7,xmm4             ; transpose coefficients(phase 2)
+       punpckldq xmm4,xmm0             ; xmm4=(40 41 42 43 50 51 52 53)
+       punpckhdq xmm7,xmm0             ; xmm7=(60 61 62 63 70 71 72 73)
+
+       movdqa     xmm6,xmm1            ; transpose coefficients(phase 3)
+       punpcklqdq xmm1,xmm5            ; xmm1=(00 01 02 03 04 05 06 07)=data0
+       punpckhqdq xmm6,xmm5            ; xmm6=(10 11 12 13 14 15 16 17)=data1
+       movdqa     xmm0,xmm7            ; transpose coefficients(phase 3)
+       punpcklqdq xmm7,xmm3            ; xmm7=(60 61 62 63 64 65 66 67)=data6
+       punpckhqdq xmm0,xmm3            ; xmm0=(70 71 72 73 74 75 76 77)=data7
+
+       movdqa  xmm5,xmm6
+       movdqa  xmm3,xmm1
+       psubw   xmm6,xmm7               ; xmm6=data1-data6=tmp6
+       psubw   xmm1,xmm0               ; xmm1=data0-data7=tmp7
+       paddw   xmm5,xmm7               ; xmm5=data1+data6=tmp1
+       paddw   xmm3,xmm0               ; xmm3=data0+data7=tmp0
+
+       movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=(24 25 26 27 34 35 36 37)
+       movdqa  xmm0, XMMWORD [wk(1)]   ; xmm0=(44 45 46 47 54 55 56 57)
+       movdqa  XMMWORD [wk(0)], xmm6   ; wk(0)=tmp6
+       movdqa  XMMWORD [wk(1)], xmm1   ; wk(1)=tmp7
+
+       movdqa     xmm6,xmm2            ; transpose coefficients(phase 3)
+       punpcklqdq xmm2,xmm7            ; xmm2=(20 21 22 23 24 25 26 27)=data2
+       punpckhqdq xmm6,xmm7            ; xmm6=(30 31 32 33 34 35 36 37)=data3
+       movdqa     xmm1,xmm4            ; transpose coefficients(phase 3)
+       punpcklqdq xmm4,xmm0            ; xmm4=(40 41 42 43 44 45 46 47)=data4
+       punpckhqdq xmm1,xmm0            ; xmm1=(50 51 52 53 54 55 56 57)=data5
+
+       movdqa  xmm7,xmm6
+       movdqa  xmm0,xmm2
+       paddw   xmm6,xmm4               ; xmm6=data3+data4=tmp3
+       paddw   xmm2,xmm1               ; xmm2=data2+data5=tmp2
+       psubw   xmm7,xmm4               ; xmm7=data3-data4=tmp4
+       psubw   xmm0,xmm1               ; xmm0=data2-data5=tmp5
+
+       ; -- Even part
+
+       movdqa  xmm4,xmm3
+       movdqa  xmm1,xmm5
+       psubw   xmm3,xmm6               ; xmm3=tmp13
+       psubw   xmm5,xmm2               ; xmm5=tmp12
+       paddw   xmm4,xmm6               ; xmm4=tmp10
+       paddw   xmm1,xmm2               ; xmm1=tmp11
+
+       paddw   xmm5,xmm3
+       psllw   xmm5,PRE_MULTIPLY_SCALE_BITS
+       pmulhw  xmm5,[PW_F0707] ; xmm5=z1
+
+       movdqa  xmm6,xmm4
+       movdqa  xmm2,xmm3
+       psubw   xmm4,xmm1               ; xmm4=data4
+       psubw   xmm3,xmm5               ; xmm3=data6
+       paddw   xmm6,xmm1               ; xmm6=data0
+       paddw   xmm2,xmm5               ; xmm2=data2
+
+       ; Final even-indexed results go straight back over the input block.
+       movdqa  XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4
+       movdqa  XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3
+       movdqa  XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm6
+       movdqa  XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm2
+
+       ; -- Odd part
+
+       movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=tmp6
+       movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp7
+
+       paddw   xmm7,xmm0               ; xmm7=tmp10
+       paddw   xmm0,xmm1               ; xmm0=tmp11
+       paddw   xmm1,xmm5               ; xmm1=tmp12, xmm5=tmp7
+
+       psllw   xmm7,PRE_MULTIPLY_SCALE_BITS
+       psllw   xmm1,PRE_MULTIPLY_SCALE_BITS
+
+       psllw   xmm0,PRE_MULTIPLY_SCALE_BITS
+       pmulhw  xmm0,[PW_F0707] ; xmm0=z3
+
+       movdqa  xmm4,xmm7               ; xmm4=tmp10
+       psubw   xmm7,xmm1
+       pmulhw  xmm7,[PW_F0382] ; xmm7=z5
+       pmulhw  xmm4,[PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+       pmulhw  xmm1,[PW_F1306] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
+       paddw   xmm4,xmm7               ; xmm4=z2
+       paddw   xmm1,xmm7               ; xmm1=z4
+
+       movdqa  xmm3,xmm5
+       psubw   xmm5,xmm0               ; xmm5=z13
+       paddw   xmm3,xmm0               ; xmm3=z11
+
+       movdqa  xmm6,xmm5
+       movdqa  xmm2,xmm3
+       psubw   xmm5,xmm4               ; xmm5=data3
+       psubw   xmm3,xmm1               ; xmm3=data7
+       paddw   xmm6,xmm4               ; xmm6=data5
+       paddw   xmm2,xmm1               ; xmm2=data1
+
+       ; Final odd-indexed results.
+       movdqa  XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5
+       movdqa  XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3
+       movdqa  XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6
+       movdqa  XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2
+
+       ; Epilogue: undo the prologue in reverse order.
+       uncollect_args
+       mov     rsp,rbp         ; rsp <- aligned rbp
+       pop     rsp             ; rsp <- pre-alignment rsp saved by the prologue
+       pop     rbp
+       ret
diff --git a/common/jpeg/simd/jfss2int-64.asm b/common/jpeg/simd/jfss2int-64.asm
new file mode 100644 (file)
index 0000000..f787921
--- /dev/null
@@ -0,0 +1,618 @@
+;
+; jfss2int-64.asm - accurate integer FDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see jfdctint.c for more
+; details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+; Fixed-point scaling parameters:
+;   CONST_BITS - number of fractional bits in the F_x_xxx multipliers below
+;                (FIX(x) is x * 2^CONST_BITS, rounded to an integer)
+;   PASS1_BITS - left shift applied to the pass-1 outputs; it is removed
+;                again by the pass-2 descaling
+%define CONST_BITS     13
+%define PASS1_BITS     2
+
+; Arithmetic right-shift counts applied to the 32-bit pmaddwd results in
+; pass 1 and pass 2 respectively.
+%define DESCALE_P1     (CONST_BITS-PASS1_BITS)
+%define DESCALE_P2     (CONST_BITS+PASS1_BITS)
+
+; For CONST_BITS == 13 the multipliers are written out as literals.
+%if CONST_BITS == 13
+F_0_298        equ      2446           ; FIX(0.298631336)
+F_0_390        equ      3196           ; FIX(0.390180644)
+F_0_541        equ      4433           ; FIX(0.541196100)
+F_0_765        equ      6270           ; FIX(0.765366865)
+F_0_899        equ      7373           ; FIX(0.899976223)
+F_1_175        equ      9633           ; FIX(1.175875602)
+F_1_501        equ     12299           ; FIX(1.501321110)
+F_1_847        equ     15137           ; FIX(1.847759065)
+F_1_961        equ     16069           ; FIX(1.961570560)
+F_2_053        equ     16819           ; FIX(2.053119869)
+F_2_562        equ     20995           ; FIX(2.562915447)
+F_3_072        equ     25172           ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+; The integers below are the same multipliers scaled by 2^30; DESCALE(x,n)
+; is a rounding right shift by n bits that brings them down to CONST_BITS
+; fractional bits.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_298        equ     DESCALE( 320652955,30-CONST_BITS)       ; FIX(0.298631336)
+F_0_390        equ     DESCALE( 418953276,30-CONST_BITS)       ; FIX(0.390180644)
+F_0_541        equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
+F_0_765        equ     DESCALE( 821806413,30-CONST_BITS)       ; FIX(0.765366865)
+F_0_899        equ     DESCALE( 966342111,30-CONST_BITS)       ; FIX(0.899976223)
+F_1_175        equ     DESCALE(1262586813,30-CONST_BITS)       ; FIX(1.175875602)
+F_1_501        equ     DESCALE(1612031267,30-CONST_BITS)       ; FIX(1.501321110)
+F_1_847        equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
+F_1_961        equ     DESCALE(2106220350,30-CONST_BITS)       ; FIX(1.961570560)
+F_2_053        equ     DESCALE(2204520673,30-CONST_BITS)       ; FIX(2.053119869)
+F_2_562        equ     DESCALE(2751909506,30-CONST_BITS)       ; FIX(2.562915447)
+F_3_072        equ     DESCALE(3299298341,30-CONST_BITS)       ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+       SECTION SEG_CONST
+
+       alignz  16
+       global  EXTN(jconst_fdct_islow_sse2)
+
+; Constant table used by jsimd_fdct_islow_sse2.
+;
+; Each PW_* entry is a pair of 16-bit multipliers repeated 4 times (one
+; xmmword).  It is used as the memory operand of pmaddwd against a register
+; holding two interleaved input vectors, so every 32-bit result lane is
+; in0*first + in1*second.  The trailing comments give the approximate real
+; value of each pair.
+;
+; PD_DESCALE_P1 / PD_DESCALE_P2 are the rounding terms added before the
+; psrad by DESCALE_P1 / DESCALE_P2; PW_DESCALE_P2X is the rounding term
+; added before the psraw by PASS1_BITS in pass 2.
+EXTN(jconst_fdct_islow_sse2):
+
+PW_F130_F054   times 4 dw  (F_0_541+F_0_765), F_0_541  ; ( 1.306563,  0.541196)
+PW_F054_MF130  times 4 dw  F_0_541, (F_0_541-F_1_847)  ; ( 0.541196, -1.306563)
+PW_MF078_F117  times 4 dw  (F_1_175-F_1_961), F_1_175  ; (-0.785695,  1.175876)
+PW_F117_F078   times 4 dw  F_1_175, (F_1_175-F_0_390)  ; ( 1.175876,  0.785695)
+PW_MF060_MF089 times 4 dw  (F_0_298-F_0_899),-F_0_899  ; (-0.601345, -0.899976)
+PW_MF089_F060  times 4 dw -F_0_899, (F_1_501-F_0_899)  ; (-0.899976,  0.601345)
+PW_MF050_MF256 times 4 dw  (F_2_053-F_2_562),-F_2_562  ; (-0.509796, -2.562915)
+PW_MF256_F050  times 4 dw -F_2_562, (F_3_072-F_2_562)  ; (-2.562915,  0.509796)
+PD_DESCALE_P1  times 4 dd  1 << (DESCALE_P1-1)
+PD_DESCALE_P2  times 4 dd  1 << (DESCALE_P2-1)
+PW_DESCALE_P2X times 8 dw  1 << (PASS1_BITS-1)
+
+       alignz  16
+
+; --------------------------------------------------------------------------
+       SECTION SEG_TEXT
+       BITS    64
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_sse2 (DCTELEM * data)
+;
+; In:     r10 = DCTELEM * data (presumably loaded by collect_args, which is
+;         defined in jsimdext.inc).  The block is read and written in place
+;         with movdqa, so it must be 16-byte aligned.
+; Out:    none; the 8 xmmword rows of data[] are overwritten with the result.
+; Stack:  WK_NUM xmmword scratch slots wk(0)..wk(WK_NUM-1), addressed below
+;         an rbp that the prologue re-aligns to 16 bytes.
+; Writes: rax, rdx, xmm0-xmm7 (plus whatever collect_args/uncollect_args
+;         touch).
+;
+; Every constant-table operand uses RIP-relative addressing ([rel ...]);
+; absolute 32-bit addresses cannot be linked into position-independent
+; 64-bit code.
+;
+
+; r10 = DCTELEM * data
+
+%define wk(i)          rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM         6
+
+       align   16
+       global  EXTN(jsimd_fdct_islow_sse2)
+
+EXTN(jsimd_fdct_islow_sse2):
+       push    rbp
+       mov     rax,rsp                         ; rax = rsp after push (-> saved rbp)
+       sub     rsp, byte 4                     ; leave room for the saved-rsp slot
+       and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
+       mov     [rsp],rax                       ; remember unaligned rsp for the epilogue
+       mov     rbp,rsp                         ; rbp = aligned rbp
+       lea     rsp, [wk(0)]
+       collect_args
+
+       ; ---- Pass 1: process rows.
+
+       mov     rdx, r10        ; (DCTELEM *)
+
+       movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
+       movdqa  xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
+       movdqa  xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
+       movdqa  xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
+
+       ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+       ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+       movdqa    xmm4,xmm0             ; transpose coefficients(phase 1)
+       punpcklwd xmm0,xmm1             ; xmm0=(00 10 01 11 02 12 03 13)
+       punpckhwd xmm4,xmm1             ; xmm4=(04 14 05 15 06 16 07 17)
+       movdqa    xmm5,xmm2             ; transpose coefficients(phase 1)
+       punpcklwd xmm2,xmm3             ; xmm2=(20 30 21 31 22 32 23 33)
+       punpckhwd xmm5,xmm3             ; xmm5=(24 34 25 35 26 36 27 37)
+
+       movdqa  xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
+       movdqa  xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
+       movdqa  xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
+       movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
+
+       ; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
+       ; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
+
+       movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=(20 30 21 31 22 32 23 33)
+       movdqa  XMMWORD [wk(1)], xmm5   ; wk(1)=(24 34 25 35 26 36 27 37)
+
+       movdqa    xmm2,xmm6             ; transpose coefficients(phase 1)
+       punpcklwd xmm6,xmm7             ; xmm6=(40 50 41 51 42 52 43 53)
+       punpckhwd xmm2,xmm7             ; xmm2=(44 54 45 55 46 56 47 57)
+       movdqa    xmm5,xmm1             ; transpose coefficients(phase 1)
+       punpcklwd xmm1,xmm3             ; xmm1=(60 70 61 71 62 72 63 73)
+       punpckhwd xmm5,xmm3             ; xmm5=(64 74 65 75 66 76 67 77)
+
+       movdqa    xmm7,xmm6             ; transpose coefficients(phase 2)
+       punpckldq xmm6,xmm1             ; xmm6=(40 50 60 70 41 51 61 71)
+       punpckhdq xmm7,xmm1             ; xmm7=(42 52 62 72 43 53 63 73)
+       movdqa    xmm3,xmm2             ; transpose coefficients(phase 2)
+       punpckldq xmm2,xmm5             ; xmm2=(44 54 64 74 45 55 65 75)
+       punpckhdq xmm3,xmm5             ; xmm3=(46 56 66 76 47 57 67 77)
+
+       movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=(20 30 21 31 22 32 23 33)
+       movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=(24 34 25 35 26 36 27 37)
+       movdqa  XMMWORD [wk(2)], xmm7   ; wk(2)=(42 52 62 72 43 53 63 73)
+       movdqa  XMMWORD [wk(3)], xmm2   ; wk(3)=(44 54 64 74 45 55 65 75)
+
+       movdqa    xmm7,xmm0             ; transpose coefficients(phase 2)
+       punpckldq xmm0,xmm1             ; xmm0=(00 10 20 30 01 11 21 31)
+       punpckhdq xmm7,xmm1             ; xmm7=(02 12 22 32 03 13 23 33)
+       movdqa    xmm2,xmm4             ; transpose coefficients(phase 2)
+       punpckldq xmm4,xmm5             ; xmm4=(04 14 24 34 05 15 25 35)
+       punpckhdq xmm2,xmm5             ; xmm2=(06 16 26 36 07 17 27 37)
+
+       movdqa     xmm1,xmm0            ; transpose coefficients(phase 3)
+       punpcklqdq xmm0,xmm6            ; xmm0=(00 10 20 30 40 50 60 70)=data0
+       punpckhqdq xmm1,xmm6            ; xmm1=(01 11 21 31 41 51 61 71)=data1
+       movdqa     xmm5,xmm2            ; transpose coefficients(phase 3)
+       punpcklqdq xmm2,xmm3            ; xmm2=(06 16 26 36 46 56 66 76)=data6
+       punpckhqdq xmm5,xmm3            ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+       movdqa  xmm6,xmm1
+       movdqa  xmm3,xmm0
+       psubw   xmm1,xmm2               ; xmm1=data1-data6=tmp6
+       psubw   xmm0,xmm5               ; xmm0=data0-data7=tmp7
+       paddw   xmm6,xmm2               ; xmm6=data1+data6=tmp1
+       paddw   xmm3,xmm5               ; xmm3=data0+data7=tmp0
+
+       movdqa  xmm2, XMMWORD [wk(2)]   ; xmm2=(42 52 62 72 43 53 63 73)
+       movdqa  xmm5, XMMWORD [wk(3)]   ; xmm5=(44 54 64 74 45 55 65 75)
+       movdqa  XMMWORD [wk(0)], xmm1   ; wk(0)=tmp6
+       movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=tmp7
+
+       movdqa     xmm1,xmm7            ; transpose coefficients(phase 3)
+       punpcklqdq xmm7,xmm2            ; xmm7=(02 12 22 32 42 52 62 72)=data2
+       punpckhqdq xmm1,xmm2            ; xmm1=(03 13 23 33 43 53 63 73)=data3
+       movdqa     xmm0,xmm4            ; transpose coefficients(phase 3)
+       punpcklqdq xmm4,xmm5            ; xmm4=(04 14 24 34 44 54 64 74)=data4
+       punpckhqdq xmm0,xmm5            ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+       movdqa  xmm2,xmm1
+       movdqa  xmm5,xmm7
+       paddw   xmm1,xmm4               ; xmm1=data3+data4=tmp3
+       paddw   xmm7,xmm0               ; xmm7=data2+data5=tmp2
+       psubw   xmm2,xmm4               ; xmm2=data3-data4=tmp4
+       psubw   xmm5,xmm0               ; xmm5=data2-data5=tmp5
+
+       ; -- Even part
+
+       movdqa  xmm4,xmm3
+       movdqa  xmm0,xmm6
+       paddw   xmm3,xmm1               ; xmm3=tmp10
+       paddw   xmm6,xmm7               ; xmm6=tmp11
+       psubw   xmm4,xmm1               ; xmm4=tmp13
+       psubw   xmm0,xmm7               ; xmm0=tmp12
+
+       movdqa  xmm1,xmm3
+       paddw   xmm3,xmm6               ; xmm3=tmp10+tmp11
+       psubw   xmm1,xmm6               ; xmm1=tmp10-tmp11
+
+       psllw   xmm3,PASS1_BITS         ; xmm3=data0
+       psllw   xmm1,PASS1_BITS         ; xmm1=data4
+
+       movdqa  XMMWORD [wk(2)], xmm3   ; wk(2)=data0
+       movdqa  XMMWORD [wk(3)], xmm1   ; wk(3)=data4
+
+       ; (Original)
+       ; z1 = (tmp12 + tmp13) * 0.541196100;
+       ; data2 = z1 + tmp13 * 0.765366865;
+       ; data6 = z1 + tmp12 * -1.847759065;
+       ;
+       ; (This implementation)
+       ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+       ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+       movdqa    xmm7,xmm4             ; xmm4=tmp13
+       movdqa    xmm6,xmm4
+       punpcklwd xmm7,xmm0             ; xmm0=tmp12
+       punpckhwd xmm6,xmm0
+       movdqa    xmm4,xmm7
+       movdqa    xmm0,xmm6
+       pmaddwd   xmm7,[rel PW_F130_F054]       ; xmm7=data2L
+       pmaddwd   xmm6,[rel PW_F130_F054]       ; xmm6=data2H
+       pmaddwd   xmm4,[rel PW_F054_MF130]      ; xmm4=data6L
+       pmaddwd   xmm0,[rel PW_F054_MF130]      ; xmm0=data6H
+
+       paddd   xmm7,[rel PD_DESCALE_P1]
+       paddd   xmm6,[rel PD_DESCALE_P1]
+       psrad   xmm7,DESCALE_P1
+       psrad   xmm6,DESCALE_P1
+       paddd   xmm4,[rel PD_DESCALE_P1]
+       paddd   xmm0,[rel PD_DESCALE_P1]
+       psrad   xmm4,DESCALE_P1
+       psrad   xmm0,DESCALE_P1
+
+       packssdw  xmm7,xmm6             ; xmm7=data2
+       packssdw  xmm4,xmm0             ; xmm4=data6
+
+       movdqa  XMMWORD [wk(4)], xmm7   ; wk(4)=data2
+       movdqa  XMMWORD [wk(5)], xmm4   ; wk(5)=data6
+
+       ; -- Odd part
+
+       movdqa  xmm3, XMMWORD [wk(0)]   ; xmm3=tmp6
+       movdqa  xmm1, XMMWORD [wk(1)]   ; xmm1=tmp7
+
+       movdqa  xmm6,xmm2               ; xmm2=tmp4
+       movdqa  xmm0,xmm5               ; xmm5=tmp5
+       paddw   xmm6,xmm3               ; xmm6=z3
+       paddw   xmm0,xmm1               ; xmm0=z4
+
+       ; (Original)
+       ; z5 = (z3 + z4) * 1.175875602;
+       ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+       ; z3 += z5;  z4 += z5;
+       ;
+       ; (This implementation)
+       ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+       ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+       movdqa    xmm7,xmm6
+       movdqa    xmm4,xmm6
+       punpcklwd xmm7,xmm0
+       punpckhwd xmm4,xmm0
+       movdqa    xmm6,xmm7
+       movdqa    xmm0,xmm4
+       pmaddwd   xmm7,[rel PW_MF078_F117]      ; xmm7=z3L
+       pmaddwd   xmm4,[rel PW_MF078_F117]      ; xmm4=z3H
+       pmaddwd   xmm6,[rel PW_F117_F078]       ; xmm6=z4L
+       pmaddwd   xmm0,[rel PW_F117_F078]       ; xmm0=z4H
+
+       movdqa  XMMWORD [wk(0)], xmm7   ; wk(0)=z3L
+       movdqa  XMMWORD [wk(1)], xmm4   ; wk(1)=z3H
+
+       ; (Original)
+       ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+       ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+       ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+       ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+       ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+       ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+       ;
+       ; (This implementation)
+       ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+       ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+       ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+       ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+       ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+       ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+       movdqa    xmm7,xmm2
+       movdqa    xmm4,xmm2
+       punpcklwd xmm7,xmm1
+       punpckhwd xmm4,xmm1
+       movdqa    xmm2,xmm7
+       movdqa    xmm1,xmm4
+       pmaddwd   xmm7,[rel PW_MF060_MF089]     ; xmm7=tmp4L
+       pmaddwd   xmm4,[rel PW_MF060_MF089]     ; xmm4=tmp4H
+       pmaddwd   xmm2,[rel PW_MF089_F060]      ; xmm2=tmp7L
+       pmaddwd   xmm1,[rel PW_MF089_F060]      ; xmm1=tmp7H
+
+       paddd   xmm7, XMMWORD [wk(0)]   ; xmm7=data7L
+       paddd   xmm4, XMMWORD [wk(1)]   ; xmm4=data7H
+       paddd   xmm2,xmm6               ; xmm2=data1L
+       paddd   xmm1,xmm0               ; xmm1=data1H
+
+       paddd   xmm7,[rel PD_DESCALE_P1]
+       paddd   xmm4,[rel PD_DESCALE_P1]
+       psrad   xmm7,DESCALE_P1
+       psrad   xmm4,DESCALE_P1
+       paddd   xmm2,[rel PD_DESCALE_P1]
+       paddd   xmm1,[rel PD_DESCALE_P1]
+       psrad   xmm2,DESCALE_P1
+       psrad   xmm1,DESCALE_P1
+
+       packssdw  xmm7,xmm4             ; xmm7=data7
+       packssdw  xmm2,xmm1             ; xmm2=data1
+
+       movdqa    xmm4,xmm5
+       movdqa    xmm1,xmm5
+       punpcklwd xmm4,xmm3
+       punpckhwd xmm1,xmm3
+       movdqa    xmm5,xmm4
+       movdqa    xmm3,xmm1
+       pmaddwd   xmm4,[rel PW_MF050_MF256]     ; xmm4=tmp5L
+       pmaddwd   xmm1,[rel PW_MF050_MF256]     ; xmm1=tmp5H
+       pmaddwd   xmm5,[rel PW_MF256_F050]      ; xmm5=tmp6L
+       pmaddwd   xmm3,[rel PW_MF256_F050]      ; xmm3=tmp6H
+
+       paddd   xmm4,xmm6               ; xmm4=data5L
+       paddd   xmm1,xmm0               ; xmm1=data5H
+       paddd   xmm5, XMMWORD [wk(0)]   ; xmm5=data3L
+       paddd   xmm3, XMMWORD [wk(1)]   ; xmm3=data3H
+
+       paddd   xmm4,[rel PD_DESCALE_P1]
+       paddd   xmm1,[rel PD_DESCALE_P1]
+       psrad   xmm4,DESCALE_P1
+       psrad   xmm1,DESCALE_P1
+       paddd   xmm5,[rel PD_DESCALE_P1]
+       paddd   xmm3,[rel PD_DESCALE_P1]
+       psrad   xmm5,DESCALE_P1
+       psrad   xmm3,DESCALE_P1
+
+       packssdw  xmm4,xmm1             ; xmm4=data5
+       packssdw  xmm5,xmm3             ; xmm5=data3
+
+       ; ---- Pass 2: process columns.
+
+       movdqa  xmm6, XMMWORD [wk(2)]   ; xmm6=col0
+       movdqa  xmm0, XMMWORD [wk(4)]   ; xmm0=col2
+
+       ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
+       ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
+
+       movdqa    xmm1,xmm6             ; transpose coefficients(phase 1)
+       punpcklwd xmm6,xmm2             ; xmm6=(00 01 10 11 20 21 30 31)
+       punpckhwd xmm1,xmm2             ; xmm1=(40 41 50 51 60 61 70 71)
+       movdqa    xmm3,xmm0             ; transpose coefficients(phase 1)
+       punpcklwd xmm0,xmm5             ; xmm0=(02 03 12 13 22 23 32 33)
+       punpckhwd xmm3,xmm5             ; xmm3=(42 43 52 53 62 63 72 73)
+
+       movdqa  xmm2, XMMWORD [wk(3)]   ; xmm2=col4
+       movdqa  xmm5, XMMWORD [wk(5)]   ; xmm5=col6
+
+       ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
+       ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
+
+       movdqa  XMMWORD [wk(0)], xmm0   ; wk(0)=(02 03 12 13 22 23 32 33)
+       movdqa  XMMWORD [wk(1)], xmm3   ; wk(1)=(42 43 52 53 62 63 72 73)
+
+       movdqa    xmm0,xmm2             ; transpose coefficients(phase 1)
+       punpcklwd xmm2,xmm4             ; xmm2=(04 05 14 15 24 25 34 35)
+       punpckhwd xmm0,xmm4             ; xmm0=(44 45 54 55 64 65 74 75)
+       movdqa    xmm3,xmm5             ; transpose coefficients(phase 1)
+       punpcklwd xmm5,xmm7             ; xmm5=(06 07 16 17 26 27 36 37)
+       punpckhwd xmm3,xmm7             ; xmm3=(46 47 56 57 66 67 76 77)
+
+       movdqa    xmm4,xmm2             ; transpose coefficients(phase 2)
+       punpckldq xmm2,xmm5             ; xmm2=(04 05 06 07 14 15 16 17)
+       punpckhdq xmm4,xmm5             ; xmm4=(24 25 26 27 34 35 36 37)
+       movdqa    xmm7,xmm0             ; transpose coefficients(phase 2)
+       punpckldq xmm0,xmm3             ; xmm0=(44 45 46 47 54 55 56 57)
+       punpckhdq xmm7,xmm3             ; xmm7=(64 65 66 67 74 75 76 77)
+
+       movdqa  xmm5, XMMWORD [wk(0)]   ; xmm5=(02 03 12 13 22 23 32 33)
+       movdqa  xmm3, XMMWORD [wk(1)]   ; xmm3=(42 43 52 53 62 63 72 73)
+       movdqa  XMMWORD [wk(2)], xmm4   ; wk(2)=(24 25 26 27 34 35 36 37)
+       movdqa  XMMWORD [wk(3)], xmm0   ; wk(3)=(44 45 46 47 54 55 56 57)
+
+       movdqa    xmm4,xmm6             ; transpose coefficients(phase 2)
+       punpckldq xmm6,xmm5             ; xmm6=(00 01 02 03 10 11 12 13)
+       punpckhdq xmm4,xmm5             ; xmm4=(20 21 22 23 30 31 32 33)
+       movdqa    xmm0,xmm1             ; transpose coefficients(phase 2)
+       punpckldq xmm1,xmm3             ; xmm1=(40 41 42 43 50 51 52 53)
+       punpckhdq xmm0,xmm3             ; xmm0=(60 61 62 63 70 71 72 73)
+
+       movdqa     xmm5,xmm6            ; transpose coefficients(phase 3)
+       punpcklqdq xmm6,xmm2            ; xmm6=(00 01 02 03 04 05 06 07)=data0
+       punpckhqdq xmm5,xmm2            ; xmm5=(10 11 12 13 14 15 16 17)=data1
+       movdqa     xmm3,xmm0            ; transpose coefficients(phase 3)
+       punpcklqdq xmm0,xmm7            ; xmm0=(60 61 62 63 64 65 66 67)=data6
+       punpckhqdq xmm3,xmm7            ; xmm3=(70 71 72 73 74 75 76 77)=data7
+
+       movdqa  xmm2,xmm5
+       movdqa  xmm7,xmm6
+       psubw   xmm5,xmm0               ; xmm5=data1-data6=tmp6
+       psubw   xmm6,xmm3               ; xmm6=data0-data7=tmp7
+       paddw   xmm2,xmm0               ; xmm2=data1+data6=tmp1
+       paddw   xmm7,xmm3               ; xmm7=data0+data7=tmp0
+
+       movdqa  xmm0, XMMWORD [wk(2)]   ; xmm0=(24 25 26 27 34 35 36 37)
+       movdqa  xmm3, XMMWORD [wk(3)]   ; xmm3=(44 45 46 47 54 55 56 57)
+       movdqa  XMMWORD [wk(0)], xmm5   ; wk(0)=tmp6
+       movdqa  XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
+
+       movdqa     xmm5,xmm4            ; transpose coefficients(phase 3)
+       punpcklqdq xmm4,xmm0            ; xmm4=(20 21 22 23 24 25 26 27)=data2
+       punpckhqdq xmm5,xmm0            ; xmm5=(30 31 32 33 34 35 36 37)=data3
+       movdqa     xmm6,xmm1            ; transpose coefficients(phase 3)
+       punpcklqdq xmm1,xmm3            ; xmm1=(40 41 42 43 44 45 46 47)=data4
+       punpckhqdq xmm6,xmm3            ; xmm6=(50 51 52 53 54 55 56 57)=data5
+
+       movdqa  xmm0,xmm5
+       movdqa  xmm3,xmm4
+       paddw   xmm5,xmm1               ; xmm5=data3+data4=tmp3
+       paddw   xmm4,xmm6               ; xmm4=data2+data5=tmp2
+       psubw   xmm0,xmm1               ; xmm0=data3-data4=tmp4
+       psubw   xmm3,xmm6               ; xmm3=data2-data5=tmp5
+
+       ; -- Even part
+
+       movdqa  xmm1,xmm7
+       movdqa  xmm6,xmm2
+       paddw   xmm7,xmm5               ; xmm7=tmp10
+       paddw   xmm2,xmm4               ; xmm2=tmp11
+       psubw   xmm1,xmm5               ; xmm1=tmp13
+       psubw   xmm6,xmm4               ; xmm6=tmp12
+
+       movdqa  xmm5,xmm7
+       paddw   xmm7,xmm2               ; xmm7=tmp10+tmp11
+       psubw   xmm5,xmm2               ; xmm5=tmp10-tmp11
+
+       paddw   xmm7,[rel PW_DESCALE_P2X]
+       paddw   xmm5,[rel PW_DESCALE_P2X]
+       psraw   xmm7,PASS1_BITS         ; xmm7=data0
+       psraw   xmm5,PASS1_BITS         ; xmm5=data4
+
+       movdqa  XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm7
+       movdqa  XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm5
+
+       ; (Original)
+       ; z1 = (tmp12 + tmp13) * 0.541196100;
+       ; data2 = z1 + tmp13 * 0.765366865;
+       ; data6 = z1 + tmp12 * -1.847759065;
+       ;
+       ; (This implementation)
+       ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+       ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+       movdqa    xmm4,xmm1             ; xmm1=tmp13
+       movdqa    xmm2,xmm1
+       punpcklwd xmm4,xmm6             ; xmm6=tmp12
+       punpckhwd xmm2,xmm6
+       movdqa    xmm1,xmm4
+       movdqa    xmm6,xmm2
+       pmaddwd   xmm4,[rel PW_F130_F054]       ; xmm4=data2L
+       pmaddwd   xmm2,[rel PW_F130_F054]       ; xmm2=data2H
+       pmaddwd   xmm1,[rel PW_F054_MF130]      ; xmm1=data6L
+       pmaddwd   xmm6,[rel PW_F054_MF130]      ; xmm6=data6H
+
+       paddd   xmm4,[rel PD_DESCALE_P2]
+       paddd   xmm2,[rel PD_DESCALE_P2]
+       psrad   xmm4,DESCALE_P2
+       psrad   xmm2,DESCALE_P2
+       paddd   xmm1,[rel PD_DESCALE_P2]
+       paddd   xmm6,[rel PD_DESCALE_P2]
+       psrad   xmm1,DESCALE_P2
+       psrad   xmm6,DESCALE_P2
+
+       packssdw  xmm4,xmm2             ; xmm4=data2
+       packssdw  xmm1,xmm6             ; xmm1=data6
+
+       movdqa  XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm4
+       movdqa  XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm1
+
+       ; -- Odd part
+
+       movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp6
+       movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp7
+
+       movdqa  xmm2,xmm0               ; xmm0=tmp4
+       movdqa  xmm6,xmm3               ; xmm3=tmp5
+       paddw   xmm2,xmm7               ; xmm2=z3
+       paddw   xmm6,xmm5               ; xmm6=z4
+
+       ; (Original)
+       ; z5 = (z3 + z4) * 1.175875602;
+       ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+       ; z3 += z5;  z4 += z5;
+       ;
+       ; (This implementation)
+       ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+       ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+       movdqa    xmm4,xmm2
+       movdqa    xmm1,xmm2
+       punpcklwd xmm4,xmm6
+       punpckhwd xmm1,xmm6
+       movdqa    xmm2,xmm4
+       movdqa    xmm6,xmm1
+       pmaddwd   xmm4,[rel PW_MF078_F117]      ; xmm4=z3L
+       pmaddwd   xmm1,[rel PW_MF078_F117]      ; xmm1=z3H
+       pmaddwd   xmm2,[rel PW_F117_F078]       ; xmm2=z4L
+       pmaddwd   xmm6,[rel PW_F117_F078]       ; xmm6=z4H
+
+       movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=z3L
+       movdqa  XMMWORD [wk(1)], xmm1   ; wk(1)=z3H
+
+       ; (Original)
+       ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+       ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+       ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+       ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+       ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+       ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+       ;
+       ; (This implementation)
+       ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+       ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+       ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+       ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+       ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+       ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+       movdqa    xmm4,xmm0
+       movdqa    xmm1,xmm0
+       punpcklwd xmm4,xmm5
+       punpckhwd xmm1,xmm5
+       movdqa    xmm0,xmm4
+       movdqa    xmm5,xmm1
+       pmaddwd   xmm4,[rel PW_MF060_MF089]     ; xmm4=tmp4L
+       pmaddwd   xmm1,[rel PW_MF060_MF089]     ; xmm1=tmp4H
+       pmaddwd   xmm0,[rel PW_MF089_F060]      ; xmm0=tmp7L
+       pmaddwd   xmm5,[rel PW_MF089_F060]      ; xmm5=tmp7H
+
+       paddd   xmm4, XMMWORD [wk(0)]   ; xmm4=data7L
+       paddd   xmm1, XMMWORD [wk(1)]   ; xmm1=data7H
+       paddd   xmm0,xmm2               ; xmm0=data1L
+       paddd   xmm5,xmm6               ; xmm5=data1H
+
+       paddd   xmm4,[rel PD_DESCALE_P2]
+       paddd   xmm1,[rel PD_DESCALE_P2]
+       psrad   xmm4,DESCALE_P2
+       psrad   xmm1,DESCALE_P2
+       paddd   xmm0,[rel PD_DESCALE_P2]
+       paddd   xmm5,[rel PD_DESCALE_P2]
+       psrad   xmm0,DESCALE_P2
+       psrad   xmm5,DESCALE_P2
+
+       packssdw  xmm4,xmm1             ; xmm4=data7
+       packssdw  xmm0,xmm5             ; xmm0=data1
+
+       movdqa  XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm4
+       movdqa  XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm0
+
+       movdqa    xmm1,xmm3
+       movdqa    xmm5,xmm3
+       punpcklwd xmm1,xmm7
+       punpckhwd xmm5,xmm7
+       movdqa    xmm3,xmm1
+       movdqa    xmm7,xmm5
+       pmaddwd   xmm1,[rel PW_MF050_MF256]     ; xmm1=tmp5L
+       pmaddwd   xmm5,[rel PW_MF050_MF256]     ; xmm5=tmp5H
+       pmaddwd   xmm3,[rel PW_MF256_F050]      ; xmm3=tmp6L
+       pmaddwd   xmm7,[rel PW_MF256_F050]      ; xmm7=tmp6H
+
+       paddd   xmm1,xmm2               ; xmm1=data5L
+       paddd   xmm5,xmm6               ; xmm5=data5H
+       paddd   xmm3, XMMWORD [wk(0)]   ; xmm3=data3L
+       paddd   xmm7, XMMWORD [wk(1)]   ; xmm7=data3H
+
+       paddd   xmm1,[rel PD_DESCALE_P2]
+       paddd   xmm5,[rel PD_DESCALE_P2]
+       psrad   xmm1,DESCALE_P2
+       psrad   xmm5,DESCALE_P2
+       paddd   xmm3,[rel PD_DESCALE_P2]
+       paddd   xmm7,[rel PD_DESCALE_P2]
+       psrad   xmm3,DESCALE_P2
+       psrad   xmm7,DESCALE_P2
+
+       packssdw  xmm1,xmm5             ; xmm1=data5
+       packssdw  xmm3,xmm7             ; xmm3=data3
+
+       movdqa  XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1
+       movdqa  XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3
+
+       uncollect_args
+       mov     rsp,rbp         ; rsp <- aligned rbp
+       pop     rsp             ; rsp <- saved pre-alignment rsp (-> saved rbp)
+       pop     rbp
+       ret
diff --git a/common/jpeg/simd/jfsseflt-64.asm b/common/jpeg/simd/jfsseflt-64.asm
new file mode 100644 (file)
index 0000000..d8f7246
--- /dev/null
@@ -0,0 +1,354 @@
+;
+; jfsseflt.asm - floating-point FDCT (64-bit SSE)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the forward DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro unpcklps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+       shufps  %1,%2,0x44      ; imm8 0x44: low two lanes come from %1[0],%1[1], high two from %2[0],%2[1]
+%endmacro
+
+%macro unpckhps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+       shufps  %1,%2,0xEE      ; imm8 0xEE: low two lanes come from %1[2],%1[3], high two from %2[2],%2[3]
+%endmacro
+
+; --------------------------------------------------------------------------
+       SECTION SEG_CONST
+
+       alignz  16
+       global  EXTN(jconst_fdct_float_sse)
+
+EXTN(jconst_fdct_float_sse):
+       ; Each constant is replicated into four single-precision lanes so it can be a mulps memory operand.
+PD_0_382       times 4 dd  0.382683432365089771728460  ; cos(3*pi/8)
+PD_0_707       times 4 dd  0.707106781186547524400844  ; cos(pi/4) = 1/sqrt(2)
+PD_0_541       times 4 dd  0.541196100146196984399723  ; cos(pi/8) - cos(3*pi/8)
+PD_1_306       times 4 dd  1.306562964876376527856643  ; cos(pi/8) + cos(3*pi/8)
+
+       alignz  16
+
+; --------------------------------------------------------------------------
+       SECTION SEG_TEXT
+       BITS    64
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_float_sse (FAST_FLOAT * data)
+;
+
+; r10 = FAST_FLOAT * data
+
+%define wk(i)          rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM         2
+
+       align   16
+       global  EXTN(jsimd_fdct_float_sse)
+
+EXTN(jsimd_fdct_float_sse):
+       push    rbp
+       mov     rax,rsp                         ; rax = rsp after the push (address of the saved rbp)
+       sub     rsp, byte 4                     ; step below the saved rbp before aligning
+       and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
+       mov     [rsp],rax                       ; keep the unaligned rsp; the epilogue reloads it with "pop rsp"
+       mov     rbp,rsp                         ; rbp = aligned frame base
+       lea     rsp, [wk(0)]                    ; reserve WK_NUM xmmwords of scratch below rbp
+       collect_args                            ; macro from an include; presumably places the argument in r10 (not visible here)
+
+       ; ---- Pass 1: process rows.
+
+       mov     rdx, r10        ; (FAST_FLOAT *)
+       mov     rcx, DCTSIZE/4  ; loop count: each iteration handles 4 rows
+.rowloop:
+
+       movaps  xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
+       movaps  xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
+       movaps  xmm2, XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)]
+       movaps  xmm3, XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)]
+
+       ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
+       ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
+
+       movaps   xmm4,xmm0              ; transpose coefficients(phase 1)
+       unpcklps xmm0,xmm1              ; xmm0=(20 30 21 31)
+       unpckhps xmm4,xmm1              ; xmm4=(22 32 23 33)
+       movaps   xmm5,xmm2              ; transpose coefficients(phase 1)
+       unpcklps xmm2,xmm3              ; xmm2=(24 34 25 35)
+       unpckhps xmm5,xmm3              ; xmm5=(26 36 27 37)
+
+       movaps  xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
+       movaps  xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+       movaps  xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
+       movaps  xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
+
+       ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
+       ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
+
+       movaps  XMMWORD [wk(0)], xmm4   ; wk(0)=(22 32 23 33)
+       movaps  XMMWORD [wk(1)], xmm2   ; wk(1)=(24 34 25 35)
+
+       movaps   xmm4,xmm6              ; transpose coefficients(phase 1)
+       unpcklps xmm6,xmm7              ; xmm6=(00 10 01 11)
+       unpckhps xmm4,xmm7              ; xmm4=(02 12 03 13)
+       movaps   xmm2,xmm1              ; transpose coefficients(phase 1)
+       unpcklps xmm1,xmm3              ; xmm1=(04 14 05 15)
+       unpckhps xmm2,xmm3              ; xmm2=(06 16 07 17)
+
+       movaps    xmm7,xmm6             ; transpose coefficients(phase 2)
+       unpcklps2 xmm6,xmm0             ; xmm6=(00 10 20 30)=data0
+       unpckhps2 xmm7,xmm0             ; xmm7=(01 11 21 31)=data1
+       movaps    xmm3,xmm2             ; transpose coefficients(phase 2)
+       unpcklps2 xmm2,xmm5             ; xmm2=(06 16 26 36)=data6
+       unpckhps2 xmm3,xmm5             ; xmm3=(07 17 27 37)=data7
+
+       movaps  xmm0,xmm7
+       movaps  xmm5,xmm6
+       subps   xmm7,xmm2               ; xmm7=data1-data6=tmp6
+       subps   xmm6,xmm3               ; xmm6=data0-data7=tmp7
+       addps   xmm0,xmm2               ; xmm0=data1+data6=tmp1
+       addps   xmm5,xmm3               ; xmm5=data0+data7=tmp0
+
+       movaps  xmm2, XMMWORD [wk(0)]   ; xmm2=(22 32 23 33)
+       movaps  xmm3, XMMWORD [wk(1)]   ; xmm3=(24 34 25 35)
+       movaps  XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
+       movaps  XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
+
+       movaps    xmm7,xmm4             ; transpose coefficients(phase 2)
+       unpcklps2 xmm4,xmm2             ; xmm4=(02 12 22 32)=data2
+       unpckhps2 xmm7,xmm2             ; xmm7=(03 13 23 33)=data3
+       movaps    xmm6,xmm1             ; transpose coefficients(phase 2)
+       unpcklps2 xmm1,xmm3             ; xmm1=(04 14 24 34)=data4
+       unpckhps2 xmm6,xmm3             ; xmm6=(05 15 25 35)=data5
+
+       movaps  xmm2,xmm7
+       movaps  xmm3,xmm4
+       addps   xmm7,xmm1               ; xmm7=data3+data4=tmp3
+       addps   xmm4,xmm6               ; xmm4=data2+data5=tmp2
+       subps   xmm2,xmm1               ; xmm2=data3-data4=tmp4
+       subps   xmm3,xmm6               ; xmm3=data2-data5=tmp5
+
+       ; -- Even part
+
+       movaps  xmm1,xmm5
+       movaps  xmm6,xmm0
+       subps   xmm5,xmm7               ; xmm5=tmp13
+       subps   xmm0,xmm4               ; xmm0=tmp12
+       addps   xmm1,xmm7               ; xmm1=tmp10
+       addps   xmm6,xmm4               ; xmm6=tmp11
+
+       addps   xmm0,xmm5               ; xmm0=tmp12+tmp13
+       mulps   xmm0,[PD_0_707] ; xmm0=z1
+
+       movaps  xmm7,xmm1
+       movaps  xmm4,xmm5
+       subps   xmm1,xmm6               ; xmm1=data4
+       subps   xmm5,xmm0               ; xmm5=data6
+       addps   xmm7,xmm6               ; xmm7=data0
+       addps   xmm4,xmm0               ; xmm4=data2
+
+       movaps  XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1
+       movaps  XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
+       movaps  XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
+       movaps  XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+       ; -- Odd part
+
+       movaps  xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
+       movaps  xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7
+
+       addps   xmm2,xmm3               ; xmm2=tmp10
+       addps   xmm3,xmm6               ; xmm3=tmp11
+       addps   xmm6,xmm0               ; xmm6=tmp12, xmm0=tmp7
+
+       mulps   xmm3,[PD_0_707] ; xmm3=z3
+
+       movaps  xmm1,xmm2               ; xmm1=tmp10
+       subps   xmm2,xmm6               ; xmm2=tmp10-tmp12
+       mulps   xmm2,[PD_0_382] ; xmm2=z5
+       mulps   xmm1,[PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+       mulps   xmm6,[PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+       addps   xmm1,xmm2               ; xmm1=z2
+       addps   xmm6,xmm2               ; xmm6=z4
+
+       movaps  xmm5,xmm0
+       subps   xmm0,xmm3               ; xmm0=z13
+       addps   xmm5,xmm3               ; xmm5=z11
+
+       movaps  xmm7,xmm0
+       movaps  xmm4,xmm5
+       subps   xmm0,xmm1               ; xmm0=data3
+       subps   xmm5,xmm6               ; xmm5=data7
+       addps   xmm7,xmm1               ; xmm7=data5
+       addps   xmm4,xmm6               ; xmm4=data1
+
+       movaps  XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
+       movaps  XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
+       movaps  XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)], xmm7
+       movaps  XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+       add     rdx, 4*DCTSIZE*SIZEOF_FAST_FLOAT        ; advance by 4 rows of DCTSIZE floats
+       dec     rcx
+       jnz     near .rowloop
+
+       ; ---- Pass 2: process columns.
+
+       mov     rdx, r10        ; (FAST_FLOAT *)
+       mov     rcx, DCTSIZE/4  ; loop count: each iteration handles 4 columns
+.columnloop:
+
+       movaps  xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
+       movaps  xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
+       movaps  xmm2, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)]
+       movaps  xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)]
+
+       ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
+       ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
+
+       movaps   xmm4,xmm0              ; transpose coefficients(phase 1)
+       unpcklps xmm0,xmm1              ; xmm0=(02 03 12 13)
+       unpckhps xmm4,xmm1              ; xmm4=(22 23 32 33)
+       movaps   xmm5,xmm2              ; transpose coefficients(phase 1)
+       unpcklps xmm2,xmm3              ; xmm2=(42 43 52 53)
+       unpckhps xmm5,xmm3              ; xmm5=(62 63 72 73)
+
+       movaps  xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
+       movaps  xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+       movaps  xmm1, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)]
+       movaps  xmm3, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)]
+
+       ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
+       ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
+
+       movaps  XMMWORD [wk(0)], xmm4   ; wk(0)=(22 23 32 33)
+       movaps  XMMWORD [wk(1)], xmm2   ; wk(1)=(42 43 52 53)
+
+       movaps   xmm4,xmm6              ; transpose coefficients(phase 1)
+       unpcklps xmm6,xmm7              ; xmm6=(00 01 10 11)
+       unpckhps xmm4,xmm7              ; xmm4=(20 21 30 31)
+       movaps   xmm2,xmm1              ; transpose coefficients(phase 1)
+       unpcklps xmm1,xmm3              ; xmm1=(40 41 50 51)
+       unpckhps xmm2,xmm3              ; xmm2=(60 61 70 71)
+
+       movaps    xmm7,xmm6             ; transpose coefficients(phase 2)
+       unpcklps2 xmm6,xmm0             ; xmm6=(00 01 02 03)=data0
+       unpckhps2 xmm7,xmm0             ; xmm7=(10 11 12 13)=data1
+       movaps    xmm3,xmm2             ; transpose coefficients(phase 2)
+       unpcklps2 xmm2,xmm5             ; xmm2=(60 61 62 63)=data6
+       unpckhps2 xmm3,xmm5             ; xmm3=(70 71 72 73)=data7
+
+       movaps  xmm0,xmm7
+       movaps  xmm5,xmm6
+       subps   xmm7,xmm2               ; xmm7=data1-data6=tmp6
+       subps   xmm6,xmm3               ; xmm6=data0-data7=tmp7
+       addps   xmm0,xmm2               ; xmm0=data1+data6=tmp1
+       addps   xmm5,xmm3               ; xmm5=data0+data7=tmp0
+
+       movaps  xmm2, XMMWORD [wk(0)]   ; xmm2=(22 23 32 33)
+       movaps  xmm3, XMMWORD [wk(1)]   ; xmm3=(42 43 52 53)
+       movaps  XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
+       movaps  XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
+
+       movaps    xmm7,xmm4             ; transpose coefficients(phase 2)
+       unpcklps2 xmm4,xmm2             ; xmm4=(20 21 22 23)=data2
+       unpckhps2 xmm7,xmm2             ; xmm7=(30 31 32 33)=data3
+       movaps    xmm6,xmm1             ; transpose coefficients(phase 2)
+       unpcklps2 xmm1,xmm3             ; xmm1=(40 41 42 43)=data4
+       unpckhps2 xmm6,xmm3             ; xmm6=(50 51 52 53)=data5
+
+       movaps  xmm2,xmm7
+       movaps  xmm3,xmm4
+       addps   xmm7,xmm1               ; xmm7=data3+data4=tmp3
+       addps   xmm4,xmm6               ; xmm4=data2+data5=tmp2
+       subps   xmm2,xmm1               ; xmm2=data3-data4=tmp4
+       subps   xmm3,xmm6               ; xmm3=data2-data5=tmp5
+
+       ; -- Even part
+
+       movaps  xmm1,xmm5
+       movaps  xmm6,xmm0
+       subps   xmm5,xmm7               ; xmm5=tmp13
+       subps   xmm0,xmm4               ; xmm0=tmp12
+       addps   xmm1,xmm7               ; xmm1=tmp10
+       addps   xmm6,xmm4               ; xmm6=tmp11
+
+       addps   xmm0,xmm5               ; xmm0=tmp12+tmp13
+       mulps   xmm0,[PD_0_707] ; xmm0=z1
+
+       movaps  xmm7,xmm1
+       movaps  xmm4,xmm5
+       subps   xmm1,xmm6               ; xmm1=data4
+       subps   xmm5,xmm0               ; xmm5=data6
+       addps   xmm7,xmm6               ; xmm7=data0
+       addps   xmm4,xmm0               ; xmm4=data2
+
+       movaps  XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1
+       movaps  XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
+       movaps  XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
+       movaps  XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+       ; -- Odd part
+
+       movaps  xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
+       movaps  xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7
+
+       addps   xmm2,xmm3               ; xmm2=tmp10
+       addps   xmm3,xmm6               ; xmm3=tmp11
+       addps   xmm6,xmm0               ; xmm6=tmp12, xmm0=tmp7
+
+       mulps   xmm3,[PD_0_707] ; xmm3=z3
+
+       movaps  xmm1,xmm2               ; xmm1=tmp10
+       subps   xmm2,xmm6               ; xmm2=tmp10-tmp12
+       mulps   xmm2,[PD_0_382] ; xmm2=z5
+       mulps   xmm1,[PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+       mulps   xmm6,[PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+       addps   xmm1,xmm2               ; xmm1=z2
+       addps   xmm6,xmm2               ; xmm6=z4
+
+       movaps  xmm5,xmm0
+       subps   xmm0,xmm3               ; xmm0=z13
+       addps   xmm5,xmm3               ; xmm5=z11
+
+       movaps  xmm7,xmm0
+       movaps  xmm4,xmm5
+       subps   xmm0,xmm1               ; xmm0=data3
+       subps   xmm5,xmm6               ; xmm5=data7
+       addps   xmm7,xmm1               ; xmm7=data5
+       addps   xmm4,xmm6               ; xmm4=data1
+
+       movaps  XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
+       movaps  XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
+       movaps  XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
+       movaps  XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+       add     rdx, byte 4*SIZEOF_FAST_FLOAT   ; advance by 4 floats (next group of 4 columns)
+       dec     rcx
+       jnz     near .columnloop
+
+       uncollect_args
+       mov     rsp,rbp         ; rsp <- aligned rbp
+       pop     rsp             ; rsp <- stack pointer value the prologue saved at [rbp]
+       pop     rbp             ; restore the caller's rbp
+       ret
diff --git a/common/jpeg/simd/jiss2flt-64.asm b/common/jpeg/simd/jiss2flt-64.asm
new file mode 100644 (file)
index 0000000..572909d
--- /dev/null
@@ -0,0 +1,479 @@
+;
+; jiss2flt.asm - floating-point IDCT (64-bit SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see the jidctflt.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro unpcklps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+       shufps  %1,%2,0x44      ; imm8 0x44: low two lanes come from %1[0],%1[1], high two from %2[0],%2[1]
+%endmacro
+
+%macro unpckhps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+       shufps  %1,%2,0xEE      ; imm8 0xEE: low two lanes come from %1[2],%1[3], high two from %2[2],%2[3]
+%endmacro
+
+; --------------------------------------------------------------------------
+       SECTION SEG_CONST
+
+       alignz  16
+       global  EXTN(jconst_idct_float_sse2)
+
+EXTN(jconst_idct_float_sse2):
+       ; Each PD_* constant is replicated into four single-precision lanes so it can be a packed memory operand.
+PD_1_414       times 4 dd  1.414213562373095048801689  ; sqrt(2)
+PD_1_847       times 4 dd  1.847759065022573512256366  ; 2*cos(pi/8)
+PD_1_082       times 4 dd  1.082392200292393968799446  ; 2*(cos(pi/8) - cos(3*pi/8))
+PD_M2_613      times 4 dd -2.613125929752753055713286  ; -2*(cos(pi/8) + cos(3*pi/8))
+PD_RNDINT_MAGIC        times 4 dd  100663296.0 ; (float)(0x00C00000 << 3) = 1.5 * 2^26; adding it leaves round(x/8) in the low mantissa bits
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE       ; 16 byte copies, added with paddb after the samples are packed to bytes
+
+       alignz  16
+
+; --------------------------------------------------------------------------
+       SECTION SEG_TEXT
+       BITS    64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_sse2 (void * dct_table, JCOEFPTR coef_block,
+;                        JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void * dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13 = JDIMENSION output_col
+
+%define original_rbp   rbp+0
+%define wk(i)          rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM         2
+%define workspace      wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
+                                       ; FAST_FLOAT workspace[DCTSIZE2]
+
+       align   16
+       global  EXTN(jsimd_idct_float_sse2)
+
+EXTN(jsimd_idct_float_sse2):
+       push    rbp
+       mov     rax,rsp                         ; rax = rsp after the push (address of the saved rbp)
+       sub     rsp, byte 4                     ; step below the saved rbp before aligning
+       and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
+       mov     [rsp],rax                       ; save all 64 bits: the epilogue's "pop rsp" reads a full qword
+       mov     rbp,rsp                         ; rbp = aligned frame base
+       lea     rsp, [workspace]                ; reserve wk[WK_NUM] plus the FAST_FLOAT workspace below rbp
+       push    rbx                             ; rbx is used as a scratch row pointer in pass 2
+       collect_args                            ; macro from an include; presumably places the arguments in r10-r13 (not visible here)
+
+       ; ---- Pass 1: process columns from input, store into work array.
+
+       mov     rdx, r10        ; quantptr
+       mov     rsi, r11                ; inptr
+       lea     rdi, [workspace]                        ; FAST_FLOAT * wsptr
+       mov     rcx, DCTSIZE/4                          ; ctr
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
+       mov     eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]      ; cheap pre-test: one dword each from (presumably) rows 1 and 2
+       or      eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+       jnz     near .columnDCT
+
+       movq    xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+       movq    xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+       movq    xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+       movq    xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+       movq    xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+       movq    xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+       movq    xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+       por     xmm1,xmm2
+       por     xmm3,xmm4
+       por     xmm5,xmm6
+       por     xmm1,xmm3
+       por     xmm5,xmm7
+       por     xmm1,xmm5               ; xmm1 = OR of blocks 1..7 loaded above
+       packsswb xmm1,xmm1              ; saturate words to bytes; a nonzero word stays a nonzero byte
+       movd    eax,xmm1                ; eax = the four packed bytes (movd also clears the upper half of rax)
+       test    rax,rax
+       jnz     short .columnDCT
+
+       ; -- AC terms all zero
+
+       movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+
+       punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
+       psrad     xmm0,(DWORD_BIT-WORD_BIT)     ; xmm0=in0=(00 01 02 03)
+       cvtdq2ps  xmm0,xmm0                     ; xmm0=in0=(00 01 02 03)
+
+       mulps   xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+
+       movaps  xmm1,xmm0
+       movaps  xmm2,xmm0
+       movaps  xmm3,xmm0
+
+       shufps  xmm0,xmm0,0x00                  ; xmm0=(00 00 00 00)
+       shufps  xmm1,xmm1,0x55                  ; xmm1=(01 01 01 01)
+       shufps  xmm2,xmm2,0xAA                  ; xmm2=(02 02 02 02)
+       shufps  xmm3,xmm3,0xFF                  ; xmm3=(03 03 03 03)
+
+       movaps  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
+       movaps  XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
+       movaps  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
+       movaps  XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
+       movaps  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
+       movaps  XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
+       movaps  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
+       movaps  XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
+       jmp     near .nextcolumn        ; skip the full column transform
+%endif
+.columnDCT:
+
+       ; -- Even part
+
+       movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+       movq      xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+       movq      xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+       movq      xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+
+       punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
+       punpcklwd xmm1,xmm1             ; xmm1=(20 20 21 21 22 22 23 23)
+       psrad     xmm0,(DWORD_BIT-WORD_BIT)     ; xmm0=in0=(00 01 02 03)
+       psrad     xmm1,(DWORD_BIT-WORD_BIT)     ; xmm1=in2=(20 21 22 23)
+       cvtdq2ps  xmm0,xmm0                     ; xmm0=in0=(00 01 02 03)
+       cvtdq2ps  xmm1,xmm1                     ; xmm1=in2=(20 21 22 23)
+
+       punpcklwd xmm2,xmm2             ; xmm2=(40 40 41 41 42 42 43 43)
+       punpcklwd xmm3,xmm3             ; xmm3=(60 60 61 61 62 62 63 63)
+       psrad     xmm2,(DWORD_BIT-WORD_BIT)     ; xmm2=in4=(40 41 42 43)
+       psrad     xmm3,(DWORD_BIT-WORD_BIT)     ; xmm3=in6=(60 61 62 63)
+       cvtdq2ps  xmm2,xmm2                     ; xmm2=in4=(40 41 42 43)
+       cvtdq2ps  xmm3,xmm3                     ; xmm3=in6=(60 61 62 63)
+
+       mulps     xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+       mulps     xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+       mulps     xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+       mulps     xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+
+       movaps  xmm4,xmm0
+       movaps  xmm5,xmm1
+       subps   xmm0,xmm2               ; xmm0=tmp11
+       subps   xmm1,xmm3               ; xmm1=in2-in6 (after dequantization)
+       addps   xmm4,xmm2               ; xmm4=tmp10
+       addps   xmm5,xmm3               ; xmm5=tmp13
+
+       mulps   xmm1,[PD_1_414]
+       subps   xmm1,xmm5               ; xmm1=tmp12
+
+       movaps  xmm6,xmm4
+       movaps  xmm7,xmm0
+       subps   xmm4,xmm5               ; xmm4=tmp3
+       subps   xmm0,xmm1               ; xmm0=tmp2
+       addps   xmm6,xmm5               ; xmm6=tmp0
+       addps   xmm7,xmm1               ; xmm7=tmp1
+
+       movaps  XMMWORD [wk(1)], xmm4   ; tmp3
+       movaps  XMMWORD [wk(0)], xmm0   ; tmp2
+
+       ; -- Odd part
+
+       movq      xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+       movq      xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+       movq      xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+       movq      xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+
+       punpcklwd xmm2,xmm2             ; xmm2=(10 10 11 11 12 12 13 13)
+       punpcklwd xmm3,xmm3             ; xmm3=(30 30 31 31 32 32 33 33)
+       psrad     xmm2,(DWORD_BIT-WORD_BIT)     ; xmm2=in1=(10 11 12 13)
+       psrad     xmm3,(DWORD_BIT-WORD_BIT)     ; xmm3=in3=(30 31 32 33)
+       cvtdq2ps  xmm2,xmm2                     ; xmm2=in1=(10 11 12 13)
+       cvtdq2ps  xmm3,xmm3                     ; xmm3=in3=(30 31 32 33)
+
+       punpcklwd xmm5,xmm5             ; xmm5=(50 50 51 51 52 52 53 53)
+       punpcklwd xmm1,xmm1             ; xmm1=(70 70 71 71 72 72 73 73)
+       psrad     xmm5,(DWORD_BIT-WORD_BIT)     ; xmm5=in5=(50 51 52 53)
+       psrad     xmm1,(DWORD_BIT-WORD_BIT)     ; xmm1=in7=(70 71 72 73)
+       cvtdq2ps  xmm5,xmm5                     ; xmm5=in5=(50 51 52 53)
+       cvtdq2ps  xmm1,xmm1                     ; xmm1=in7=(70 71 72 73)
+
+       mulps     xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+       mulps     xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+       mulps     xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+       mulps     xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+
+       movaps  xmm4,xmm2
+       movaps  xmm0,xmm5
+       addps   xmm2,xmm1               ; xmm2=z11
+       addps   xmm5,xmm3               ; xmm5=z13
+       subps   xmm4,xmm1               ; xmm4=z12
+       subps   xmm0,xmm3               ; xmm0=z10
+
+       movaps  xmm1,xmm2
+       subps   xmm2,xmm5               ; xmm2=z11-z13
+       addps   xmm1,xmm5               ; xmm1=tmp7
+
+       mulps   xmm2,[PD_1_414] ; xmm2=tmp11
+
+       movaps  xmm3,xmm0
+       addps   xmm0,xmm4               ; xmm0=z10+z12
+       mulps   xmm0,[PD_1_847] ; xmm0=z5
+       mulps   xmm3,[PD_M2_613]        ; xmm3=(z10 * -2.613125930)
+       mulps   xmm4,[PD_1_082] ; xmm4=(z12 * 1.082392200)
+       addps   xmm3,xmm0               ; xmm3=tmp12
+       subps   xmm4,xmm0               ; xmm4=tmp10
+
+       ; -- Final output stage
+
+       subps   xmm3,xmm1               ; xmm3=tmp6
+       movaps  xmm5,xmm6
+       movaps  xmm0,xmm7
+       addps   xmm6,xmm1               ; xmm6=data0=(00 01 02 03)
+       addps   xmm7,xmm3               ; xmm7=data1=(10 11 12 13)
+       subps   xmm5,xmm1               ; xmm5=data7=(70 71 72 73)
+       subps   xmm0,xmm3               ; xmm0=data6=(60 61 62 63)
+       subps   xmm2,xmm3               ; xmm2=tmp5
+
+       movaps    xmm1,xmm6             ; transpose coefficients(phase 1)
+       unpcklps  xmm6,xmm7             ; xmm6=(00 10 01 11)
+       unpckhps  xmm1,xmm7             ; xmm1=(02 12 03 13)
+       movaps    xmm3,xmm0             ; transpose coefficients(phase 1)
+       unpcklps  xmm0,xmm5             ; xmm0=(60 70 61 71)
+       unpckhps  xmm3,xmm5             ; xmm3=(62 72 63 73)
+
+       movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
+       movaps  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3
+
+       movaps  XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
+       movaps  XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)
+
+       addps   xmm4,xmm2               ; xmm4=tmp4
+       movaps  xmm0,xmm7
+       movaps  xmm3,xmm5
+       addps   xmm7,xmm2               ; xmm7=data2=(20 21 22 23)
+       addps   xmm5,xmm4               ; xmm5=data4=(40 41 42 43)
+       subps   xmm0,xmm2               ; xmm0=data5=(50 51 52 53)
+       subps   xmm3,xmm4               ; xmm3=data3=(30 31 32 33)
+
+       movaps    xmm2,xmm7             ; transpose coefficients(phase 1)
+       unpcklps  xmm7,xmm3             ; xmm7=(20 30 21 31)
+       unpckhps  xmm2,xmm3             ; xmm2=(22 32 23 33)
+       movaps    xmm4,xmm5             ; transpose coefficients(phase 1)
+       unpcklps  xmm5,xmm0             ; xmm5=(40 50 41 51)
+       unpckhps  xmm4,xmm0             ; xmm4=(42 52 43 53)
+
+       movaps    xmm3,xmm6             ; transpose coefficients(phase 2)
+       unpcklps2 xmm6,xmm7             ; xmm6=(00 10 20 30)
+       unpckhps2 xmm3,xmm7             ; xmm3=(01 11 21 31)
+       movaps    xmm0,xmm1             ; transpose coefficients(phase 2)
+       unpcklps2 xmm1,xmm2             ; xmm1=(02 12 22 32)
+       unpckhps2 xmm0,xmm2             ; xmm0=(03 13 23 33)
+
+       movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
+       movaps  xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
+
+       movaps  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
+       movaps  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
+       movaps  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
+       movaps  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
+
+       movaps    xmm6,xmm5             ; transpose coefficients(phase 2)
+       unpcklps2 xmm5,xmm7             ; xmm5=(40 50 60 70)
+       unpckhps2 xmm6,xmm7             ; xmm6=(41 51 61 71)
+       movaps    xmm3,xmm4             ; transpose coefficients(phase 2)
+       unpcklps2 xmm4,xmm2             ; xmm4=(42 52 62 72)
+       unpckhps2 xmm3,xmm2             ; xmm3=(43 53 63 73)
+
+       movaps  XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
+       movaps  XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
+       movaps  XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
+       movaps  XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
+
+.nextcolumn:
+       add     rsi, byte 4*SIZEOF_JCOEF                ; coef_block
+       add     rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE      ; quantptr
+       add     rdi,      4*DCTSIZE*SIZEOF_FAST_FLOAT   ; wsptr
+       dec     rcx                                     ; ctr
+       jnz     near .columnloop
+
+       ; -- Prefetch the next coefficient block
+
+       prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+       prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+       prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+       prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+       ; ---- Pass 2: process rows from work array, store into output array.
+
+       ; (the arguments are already in r12/r13, so no frame-pointer reload is needed here)
+       lea     rsi, [workspace]                        ; FAST_FLOAT * wsptr
+       mov     rdi, r12        ; (JSAMPROW *)
+       mov     eax, r13d       ; output_col: zero-extend the 32-bit JDIMENSION; r13's upper half is not guaranteed clean
+       mov     rcx, DCTSIZE/4                          ; ctr
+.rowloop:
+
+       ; -- Even part
+
+       movaps  xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
+       movaps  xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
+       movaps  xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
+       movaps  xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
+
+       movaps  xmm4,xmm0
+       movaps  xmm5,xmm1
+       subps   xmm0,xmm2               ; xmm0=tmp11
+       subps   xmm1,xmm3               ; xmm1=(block 2 - block 6), scaled next
+       addps   xmm4,xmm2               ; xmm4=tmp10
+       addps   xmm5,xmm3               ; xmm5=tmp13
+
+       mulps   xmm1,[PD_1_414]
+       subps   xmm1,xmm5               ; xmm1=tmp12
+
+       movaps  xmm6,xmm4
+       movaps  xmm7,xmm0
+       subps   xmm4,xmm5               ; xmm4=tmp3
+       subps   xmm0,xmm1               ; xmm0=tmp2
+       addps   xmm6,xmm5               ; xmm6=tmp0
+       addps   xmm7,xmm1               ; xmm7=tmp1
+
+       movaps  XMMWORD [wk(1)], xmm4   ; tmp3
+       movaps  XMMWORD [wk(0)], xmm0   ; tmp2
+
+       ; -- Odd part
+
+       movaps  xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
+       movaps  xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
+       movaps  xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
+       movaps  xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
+
+       movaps  xmm4,xmm2
+       movaps  xmm0,xmm5
+       addps   xmm2,xmm1               ; xmm2=z11
+       addps   xmm5,xmm3               ; xmm5=z13
+       subps   xmm4,xmm1               ; xmm4=z12
+       subps   xmm0,xmm3               ; xmm0=z10
+
+       movaps  xmm1,xmm2
+       subps   xmm2,xmm5               ; xmm2=z11-z13
+       addps   xmm1,xmm5               ; xmm1=tmp7
+
+       mulps   xmm2,[PD_1_414] ; xmm2=tmp11
+
+       movaps  xmm3,xmm0
+       addps   xmm0,xmm4               ; xmm0=z10+z12
+       mulps   xmm0,[PD_1_847] ; xmm0=z5
+       mulps   xmm3,[PD_M2_613]        ; xmm3=(z10 * -2.613125930)
+       mulps   xmm4,[PD_1_082] ; xmm4=(z12 * 1.082392200)
+       addps   xmm3,xmm0               ; xmm3=tmp12
+       subps   xmm4,xmm0               ; xmm4=tmp10
+
+       ; -- Final output stage
+
+       subps   xmm3,xmm1               ; xmm3=tmp6
+       movaps  xmm5,xmm6
+       movaps  xmm0,xmm7
+       addps   xmm6,xmm1               ; xmm6=data0=(00 10 20 30)
+       addps   xmm7,xmm3               ; xmm7=data1=(01 11 21 31)
+       subps   xmm5,xmm1               ; xmm5=data7=(07 17 27 37)
+       subps   xmm0,xmm3               ; xmm0=data6=(06 16 26 36)
+       subps   xmm2,xmm3               ; xmm2=tmp5
+
+       movaps  xmm1,[PD_RNDINT_MAGIC]  ; xmm1=[PD_RNDINT_MAGIC]
+       pcmpeqd xmm3,xmm3
+       psrld   xmm3,WORD_BIT           ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+       addps   xmm6,xmm1       ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
+       addps   xmm7,xmm1       ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
+       addps   xmm0,xmm1       ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
+       addps   xmm5,xmm1       ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
+
+       pand    xmm6,xmm3               ; xmm6=(00 -- 10 -- 20 -- 30 --)
+       pslld   xmm7,WORD_BIT           ; xmm7=(-- 01 -- 11 -- 21 -- 31)
+       pand    xmm0,xmm3               ; xmm0=(06 -- 16 -- 26 -- 36 --)
+       pslld   xmm5,WORD_BIT           ; xmm5=(-- 07 -- 17 -- 27 -- 37)
+       por     xmm6,xmm7               ; xmm6=(00 01 10 11 20 21 30 31)
+       por     xmm0,xmm5               ; xmm0=(06 07 16 17 26 27 36 37)
+
+       movaps  xmm1, XMMWORD [wk(0)]   ; xmm1=tmp2
+       movaps  xmm3, XMMWORD [wk(1)]   ; xmm3=tmp3
+
+       addps   xmm4,xmm2               ; xmm4=tmp4
+       movaps  xmm7,xmm1
+       movaps  xmm5,xmm3
+       addps   xmm1,xmm2               ; xmm1=data2=(02 12 22 32)
+       addps   xmm3,xmm4               ; xmm3=data4=(04 14 24 34)
+       subps   xmm7,xmm2               ; xmm7=data5=(05 15 25 35)
+       subps   xmm5,xmm4               ; xmm5=data3=(03 13 23 33)
+
+       movaps  xmm2,[PD_RNDINT_MAGIC]  ; xmm2=[PD_RNDINT_MAGIC]
+       pcmpeqd xmm4,xmm4
+       psrld   xmm4,WORD_BIT           ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+       addps   xmm3,xmm2       ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
+       addps   xmm7,xmm2       ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
+       addps   xmm1,xmm2       ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
+       addps   xmm5,xmm2       ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
+
+       pand    xmm3,xmm4               ; xmm3=(04 -- 14 -- 24 -- 34 --)
+       pslld   xmm7,WORD_BIT           ; xmm7=(-- 05 -- 15 -- 25 -- 35)
+       pand    xmm1,xmm4               ; xmm1=(02 -- 12 -- 22 -- 32 --)
+       pslld   xmm5,WORD_BIT           ; xmm5=(-- 03 -- 13 -- 23 -- 33)
+       por     xmm3,xmm7               ; xmm3=(04 05 14 15 24 25 34 35)
+       por     xmm1,xmm5               ; xmm1=(02 03 12 13 22 23 32 33)
+
+       movdqa    xmm2,[PB_CENTERJSAMP] ; xmm2=[PB_CENTERJSAMP]
+
+       packsswb  xmm6,xmm3     ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
+       packsswb  xmm1,xmm0     ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
+       paddb     xmm6,xmm2     ; add CENTERJSAMPLE to every packed byte
+       paddb     xmm1,xmm2     ; add CENTERJSAMPLE to every packed byte
+
+       movdqa    xmm4,xmm6     ; transpose coefficients(phase 2)
+       punpcklwd xmm6,xmm1     ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+       punpckhwd xmm4,xmm1     ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+
+       movdqa    xmm7,xmm6     ; transpose coefficients(phase 3)
+       punpckldq xmm6,xmm4     ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+       punpckhdq xmm7,xmm4     ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+
+       pshufd  xmm5,xmm6,0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+       pshufd  xmm3,xmm7,0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+
+       mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]   ; row pointer 0 of this group
+       mov     rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]   ; row pointer 2 of this group
+       movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6       ; low 8 bytes of xmm6 at offset rax in the row
+       movq    XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
+       mov     rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]   ; row pointer 1 of this group
+       mov     rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]   ; row pointer 3 of this group
+       movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
+       movq    XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3
+
+       add     rsi, byte 4*SIZEOF_FAST_FLOAT   ; wsptr
+       add     rdi, byte 4*SIZEOF_JSAMPROW     ; step past the 4 row pointers just used
+       dec     rcx                             ; ctr
+       jnz     near .rowloop
+
+       uncollect_args
+       pop     rbx
+       mov     rsp,rbp         ; rsp <- aligned rbp
+       pop     rsp             ; rsp <- stack pointer value the prologue saved at [rbp]
+       pop     rbp             ; restore the caller's rbp
+       ret
diff --git a/common/jpeg/simd/jiss2fst-64.asm b/common/jpeg/simd/jiss2fst-64.asm
new file mode 100644 (file)
index 0000000..97dfa76
--- /dev/null
@@ -0,0 +1,488 @@
+;
+; jiss2fst.asm - fast integer IDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see the jidctfst.c
+; for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+; Fixed-point configuration for the fast integer IDCT.
+;   CONST_BITS - number of fractional bits carried by the F_* multipliers
+;                defined below.
+;   PASS1_BITS - must equal IFAST_SCALE_BITS (enforced by the %if below);
+;                pass 2 descales its results by (PASS1_BITS+3) bits.
+%define CONST_BITS     8       ; 14 is also OK.
+%define PASS1_BITS     2
+
+%if IFAST_SCALE_BITS != PASS1_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+
+; FIX(x) denotes x * 2^CONST_BITS rounded to the nearest integer.
+; For the default CONST_BITS the values are written out literally; for any
+; other setting they are derived from the same constants pre-scaled by 2^30.
+%if CONST_BITS == 8
+F_1_082        equ     277             ; FIX(1.082392200)
+F_1_414        equ     362             ; FIX(1.414213562)
+F_1_847        equ     473             ; FIX(1.847759065)
+F_2_613        equ     669             ; FIX(2.613125930)
+F_1_613        equ     (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+; DESCALE(x,n) is a right shift by n bits with round-to-nearest
+; (half of 2^n is added before shifting).
+%define        DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_1_082        equ     DESCALE(1162209775,30-CONST_BITS)       ; FIX(1.082392200)
+F_1_414        equ     DESCALE(1518500249,30-CONST_BITS)       ; FIX(1.414213562)
+F_1_847        equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
+F_2_613        equ     DESCALE(2805822602,30-CONST_BITS)       ; FIX(2.613125930)
+F_1_613        equ     (F_2_613 - (1 << CONST_BITS))   ; FIX(2.613125930) - FIX(1)
+%endif
+
+; --------------------------------------------------------------------------
+       SECTION SEG_CONST
+
+; Constant table for jsimd_idct_ifast_sse2.
+;
+; Every PW_* entry is one 16-byte XMM operand holding the same signed
+; 16-bit multiplier in all 8 lanes.  The IDCT shifts its input left by
+; PRE_MULTIPLY_SCALE_BITS and then uses pmulhw (which keeps the upper 16
+; bits of the 32-bit product), so the multipliers are stored shifted left
+; by CONST_SHIFT to make the three shift amounts add up to 16.
+;
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS   2
+%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+       alignz  16
+       global  EXTN(jconst_idct_ifast_sse2)
+
+EXTN(jconst_idct_ifast_sse2):
+
+PW_F1414       times 8 dw  F_1_414 << CONST_SHIFT
+PW_F1847       times 8 dw  F_1_847 << CONST_SHIFT
+; Negated (1.613...) multiplier; the remaining "- 1 * z10" of the
+; -2.613... product is applied with a separate psubw in the IDCT.
+PW_MF1613      times 8 dw -F_1_613 << CONST_SHIFT
+PW_F1082       times 8 dw  F_1_082 << CONST_SHIFT
+; 16 copies of CENTERJSAMPLE, added byte-wise (paddb) to the packed
+; signed results at the end of pass 2.
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE
+
+       alignz  16
+
+; --------------------------------------------------------------------------
+       SECTION SEG_TEXT
+       BITS    64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_ifast_sse2 (void * dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; Register assignment after collect_args:
+; r10 = void * dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13 = JDIMENSION output_col
+;
+; Overview:
+;   Pass 1 dequantizes the 8x8 coefficient block (pmullw with dct_table),
+;   runs the 1-D IDCT down the columns and transposes the result so that
+;   pass 2 can reuse the same column-wise arithmetic for the rows.
+;   Pass 2 runs the second 1-D IDCT, descales by (PASS1_BITS+3) bits,
+;   saturates to signed bytes, adds CENTERJSAMPLE, transposes back and
+;   stores 8 bytes to each of the 8 output rows at column output_col.
+;   If every AC coefficient is zero, pass 1 is replaced by a broadcast
+;   of the dequantized DC row (unless NO_ZERO_COLUMN_TEST_IFAST_SSE2 is
+;   defined).
+;
+; Stack frame (rbp is 16-byte aligned):
+;   [rbp+0]   = stack pointer as it was right after "push rbp"
+;   wk(0..1)  = two 16-byte aligned XMM scratch slots below rbp
+;
+; Scratch registers written here: rax, rdx, rsi, rdi, xmm0-xmm7.
+
+%define original_rbp   rbp+0
+%define wk(i)          rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM         2
+
+       align   16
+       global  EXTN(jsimd_idct_ifast_sse2)
+
+EXTN(jsimd_idct_ifast_sse2):
+       push    rbp
+       mov     rax,rsp                         ; rax = rsp after push (-> saved rbp)
+       sub     rsp, byte 4
+       and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
+       mov     [rsp],rax                       ; full 64-bit store: the epilogue's
+                                               ; "pop rsp" reloads all 8 bytes
+       mov     rbp,rsp                         ; rbp = aligned rbp
+       lea     rsp, [wk(0)]
+       collect_args
+
+       ; ---- Pass 1: process columns from input.
+
+       mov     rdx, r10        ; quantptr
+       mov     rsi, r11                ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
+       ; Quick reject: the first two coefficients of rows 1 and 2.
+       mov     eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+       or      eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+       jnz     near .columnDCT
+
+       ; Full test: OR rows 1..7 together and check for any set bit.
+       movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+       movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+       por     xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+       por     xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+       por     xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+       por     xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+       por     xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+       por     xmm1,xmm0
+       packsswb xmm1,xmm1
+       packsswb xmm1,xmm1
+       movd    eax,xmm1
+       test    rax,rax
+       jnz     short .columnDCT
+
+       ; -- AC terms all zero
+
+       movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+       pmullw  xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+
+       movdqa    xmm7,xmm0             ; xmm0=in0=(00 01 02 03 04 05 06 07)
+       punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
+       punpckhwd xmm7,xmm7             ; xmm7=(04 04 05 05 06 06 07 07)
+
+       pshufd  xmm6,xmm0,0x00          ; xmm6=col0=(00 00 00 00 00 00 00 00)
+       pshufd  xmm2,xmm0,0x55          ; xmm2=col1=(01 01 01 01 01 01 01 01)
+       pshufd  xmm5,xmm0,0xAA          ; xmm5=col2=(02 02 02 02 02 02 02 02)
+       pshufd  xmm0,xmm0,0xFF          ; xmm0=col3=(03 03 03 03 03 03 03 03)
+       pshufd  xmm1,xmm7,0x00          ; xmm1=col4=(04 04 04 04 04 04 04 04)
+       pshufd  xmm4,xmm7,0x55          ; xmm4=col5=(05 05 05 05 05 05 05 05)
+       pshufd  xmm3,xmm7,0xAA          ; xmm3=col6=(06 06 06 06 06 06 06 06)
+       pshufd  xmm7,xmm7,0xFF          ; xmm7=col7=(07 07 07 07 07 07 07 07)
+
+       ; Leave the registers/work slots exactly as the full column pass does.
+       movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=col1
+       movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=col3
+       jmp     near .column_end
+%endif
+.columnDCT:
+
+       ; -- Even part
+
+       movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+       movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+       pmullw  xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+       pmullw  xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+       movdqa  xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+       movdqa  xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+       pmullw  xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+       pmullw  xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+
+       movdqa  xmm4,xmm0
+       movdqa  xmm5,xmm1
+       psubw   xmm0,xmm2               ; xmm0=tmp11
+       psubw   xmm1,xmm3
+       paddw   xmm4,xmm2               ; xmm4=tmp10
+       paddw   xmm5,xmm3               ; xmm5=tmp13
+
+       psllw   xmm1,PRE_MULTIPLY_SCALE_BITS
+       pmulhw  xmm1,[PW_F1414]
+       psubw   xmm1,xmm5               ; xmm1=tmp12
+
+       movdqa  xmm6,xmm4
+       movdqa  xmm7,xmm0
+       psubw   xmm4,xmm5               ; xmm4=tmp3
+       psubw   xmm0,xmm1               ; xmm0=tmp2
+       paddw   xmm6,xmm5               ; xmm6=tmp0
+       paddw   xmm7,xmm1               ; xmm7=tmp1
+
+       movdqa  XMMWORD [wk(1)], xmm4   ; wk(1)=tmp3
+       movdqa  XMMWORD [wk(0)], xmm0   ; wk(0)=tmp2
+
+       ; -- Odd part
+
+       movdqa  xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+       movdqa  xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+       pmullw  xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+       pmullw  xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+       movdqa  xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+       movdqa  xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+       pmullw  xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+       pmullw  xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+
+       movdqa  xmm4,xmm2
+       movdqa  xmm0,xmm5
+       psubw   xmm2,xmm1               ; xmm2=z12
+       psubw   xmm5,xmm3               ; xmm5=z10
+       paddw   xmm4,xmm1               ; xmm4=z11
+       paddw   xmm0,xmm3               ; xmm0=z13
+
+       movdqa  xmm1,xmm5               ; xmm1=z10(unscaled)
+       psllw   xmm2,PRE_MULTIPLY_SCALE_BITS
+       psllw   xmm5,PRE_MULTIPLY_SCALE_BITS
+
+       movdqa  xmm3,xmm4
+       psubw   xmm4,xmm0
+       paddw   xmm3,xmm0               ; xmm3=tmp7
+
+       psllw   xmm4,PRE_MULTIPLY_SCALE_BITS
+       pmulhw  xmm4,[PW_F1414] ; xmm4=tmp11
+
+       ; To avoid overflow...
+       ;
+       ; (Original)
+       ; tmp12 = -2.613125930 * z10 + z5;
+       ;
+       ; (This implementation)
+       ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+       ;       = -1.613125930 * z10 - z10 + z5;
+
+       movdqa  xmm0,xmm5
+       paddw   xmm5,xmm2
+       pmulhw  xmm5,[PW_F1847] ; xmm5=z5
+       pmulhw  xmm0,[PW_MF1613]
+       pmulhw  xmm2,[PW_F1082]
+       psubw   xmm0,xmm1
+       psubw   xmm2,xmm5               ; xmm2=tmp10
+       paddw   xmm0,xmm5               ; xmm0=tmp12
+
+       ; -- Final output stage
+
+       psubw   xmm0,xmm3               ; xmm0=tmp6
+       movdqa  xmm1,xmm6
+       movdqa  xmm5,xmm7
+       paddw   xmm6,xmm3               ; xmm6=data0=(00 01 02 03 04 05 06 07)
+       paddw   xmm7,xmm0               ; xmm7=data1=(10 11 12 13 14 15 16 17)
+       psubw   xmm1,xmm3               ; xmm1=data7=(70 71 72 73 74 75 76 77)
+       psubw   xmm5,xmm0               ; xmm5=data6=(60 61 62 63 64 65 66 67)
+       psubw   xmm4,xmm0               ; xmm4=tmp5
+
+       movdqa    xmm3,xmm6             ; transpose coefficients(phase 1)
+       punpcklwd xmm6,xmm7             ; xmm6=(00 10 01 11 02 12 03 13)
+       punpckhwd xmm3,xmm7             ; xmm3=(04 14 05 15 06 16 07 17)
+       movdqa    xmm0,xmm5             ; transpose coefficients(phase 1)
+       punpcklwd xmm5,xmm1             ; xmm5=(60 70 61 71 62 72 63 73)
+       punpckhwd xmm0,xmm1             ; xmm0=(64 74 65 75 66 76 67 77)
+
+       movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
+       movdqa  xmm1, XMMWORD [wk(1)]   ; xmm1=tmp3
+
+       movdqa  XMMWORD [wk(0)], xmm5   ; wk(0)=(60 70 61 71 62 72 63 73)
+       movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=(64 74 65 75 66 76 67 77)
+
+       paddw   xmm2,xmm4               ; xmm2=tmp4
+       movdqa  xmm5,xmm7
+       movdqa  xmm0,xmm1
+       paddw   xmm7,xmm4               ; xmm7=data2=(20 21 22 23 24 25 26 27)
+       paddw   xmm1,xmm2               ; xmm1=data4=(40 41 42 43 44 45 46 47)
+       psubw   xmm5,xmm4               ; xmm5=data5=(50 51 52 53 54 55 56 57)
+       psubw   xmm0,xmm2               ; xmm0=data3=(30 31 32 33 34 35 36 37)
+
+       movdqa    xmm4,xmm7             ; transpose coefficients(phase 1)
+       punpcklwd xmm7,xmm0             ; xmm7=(20 30 21 31 22 32 23 33)
+       punpckhwd xmm4,xmm0             ; xmm4=(24 34 25 35 26 36 27 37)
+       movdqa    xmm2,xmm1             ; transpose coefficients(phase 1)
+       punpcklwd xmm1,xmm5             ; xmm1=(40 50 41 51 42 52 43 53)
+       punpckhwd xmm2,xmm5             ; xmm2=(44 54 45 55 46 56 47 57)
+
+       movdqa    xmm0,xmm3             ; transpose coefficients(phase 2)
+       punpckldq xmm3,xmm4             ; xmm3=(04 14 24 34 05 15 25 35)
+       punpckhdq xmm0,xmm4             ; xmm0=(06 16 26 36 07 17 27 37)
+       movdqa    xmm5,xmm6             ; transpose coefficients(phase 2)
+       punpckldq xmm6,xmm7             ; xmm6=(00 10 20 30 01 11 21 31)
+       punpckhdq xmm5,xmm7             ; xmm5=(02 12 22 32 03 13 23 33)
+
+       movdqa  xmm4, XMMWORD [wk(0)]   ; xmm4=(60 70 61 71 62 72 63 73)
+       movdqa  xmm7, XMMWORD [wk(1)]   ; xmm7=(64 74 65 75 66 76 67 77)
+
+       movdqa  XMMWORD [wk(0)], xmm3   ; wk(0)=(04 14 24 34 05 15 25 35)
+       movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=(06 16 26 36 07 17 27 37)
+
+       movdqa    xmm3,xmm1             ; transpose coefficients(phase 2)
+       punpckldq xmm1,xmm4             ; xmm1=(40 50 60 70 41 51 61 71)
+       punpckhdq xmm3,xmm4             ; xmm3=(42 52 62 72 43 53 63 73)
+       movdqa    xmm0,xmm2             ; transpose coefficients(phase 2)
+       punpckldq xmm2,xmm7             ; xmm2=(44 54 64 74 45 55 65 75)
+       punpckhdq xmm0,xmm7             ; xmm0=(46 56 66 76 47 57 67 77)
+
+       movdqa     xmm4,xmm6            ; transpose coefficients(phase 3)
+       punpcklqdq xmm6,xmm1            ; xmm6=col0=(00 10 20 30 40 50 60 70)
+       punpckhqdq xmm4,xmm1            ; xmm4=col1=(01 11 21 31 41 51 61 71)
+       movdqa     xmm7,xmm5            ; transpose coefficients(phase 3)
+       punpcklqdq xmm5,xmm3            ; xmm5=col2=(02 12 22 32 42 52 62 72)
+       punpckhqdq xmm7,xmm3            ; xmm7=col3=(03 13 23 33 43 53 63 73)
+
+       movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=(04 14 24 34 05 15 25 35)
+       movdqa  xmm3, XMMWORD [wk(1)]   ; xmm3=(06 16 26 36 07 17 27 37)
+
+       movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=col1
+       movdqa  XMMWORD [wk(1)], xmm7   ; wk(1)=col3
+
+       movdqa     xmm4,xmm1            ; transpose coefficients(phase 3)
+       punpcklqdq xmm1,xmm2            ; xmm1=col4=(04 14 24 34 44 54 64 74)
+       punpckhqdq xmm4,xmm2            ; xmm4=col5=(05 15 25 35 45 55 65 75)
+       movdqa     xmm7,xmm3            ; transpose coefficients(phase 3)
+       punpcklqdq xmm3,xmm0            ; xmm3=col6=(06 16 26 36 46 56 66 76)
+       punpckhqdq xmm7,xmm0            ; xmm7=col7=(07 17 27 37 47 57 67 77)
+.column_end:
+
+       ; -- Prefetch the next coefficient block
+
+       prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+       prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+       prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+       prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+       ; ---- Pass 2: process rows from work array, store into output array.
+
+       mov     rdi, r12        ; (JSAMPROW *)
+       ; output_col is a 32-bit JDIMENSION (unsigned int in libjpeg's
+       ; jmorecfg.h); the upper half of the 64-bit argument register is not
+       ; guaranteed to be zero, so zero-extend before using it as an index.
+       mov     eax, r13d
+
+       ; -- Even part
+
+       ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
+
+       movdqa  xmm2,xmm6
+       movdqa  xmm0,xmm5
+       psubw   xmm6,xmm1               ; xmm6=tmp11
+       psubw   xmm5,xmm3
+       paddw   xmm2,xmm1               ; xmm2=tmp10
+       paddw   xmm0,xmm3               ; xmm0=tmp13
+
+       psllw   xmm5,PRE_MULTIPLY_SCALE_BITS
+       pmulhw  xmm5,[PW_F1414]
+       psubw   xmm5,xmm0               ; xmm5=tmp12
+
+       movdqa  xmm1,xmm2
+       movdqa  xmm3,xmm6
+       psubw   xmm2,xmm0               ; xmm2=tmp3
+       psubw   xmm6,xmm5               ; xmm6=tmp2
+       paddw   xmm1,xmm0               ; xmm1=tmp0
+       paddw   xmm3,xmm5               ; xmm3=tmp1
+
+       movdqa  xmm0, XMMWORD [wk(0)]   ; xmm0=col1
+       movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=col3
+
+       movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=tmp3
+       movdqa  XMMWORD [wk(1)], xmm6   ; wk(1)=tmp2
+
+       ; -- Odd part
+
+       ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
+
+       movdqa  xmm2,xmm0
+       movdqa  xmm6,xmm4
+       psubw   xmm0,xmm7               ; xmm0=z12
+       psubw   xmm4,xmm5               ; xmm4=z10
+       paddw   xmm2,xmm7               ; xmm2=z11
+       paddw   xmm6,xmm5               ; xmm6=z13
+
+       movdqa  xmm7,xmm4               ; xmm7=z10(unscaled)
+       psllw   xmm0,PRE_MULTIPLY_SCALE_BITS
+       psllw   xmm4,PRE_MULTIPLY_SCALE_BITS
+
+       movdqa  xmm5,xmm2
+       psubw   xmm2,xmm6
+       paddw   xmm5,xmm6               ; xmm5=tmp7
+
+       psllw   xmm2,PRE_MULTIPLY_SCALE_BITS
+       pmulhw  xmm2,[PW_F1414] ; xmm2=tmp11
+
+       ; To avoid overflow...
+       ;
+       ; (Original)
+       ; tmp12 = -2.613125930 * z10 + z5;
+       ;
+       ; (This implementation)
+       ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+       ;       = -1.613125930 * z10 - z10 + z5;
+
+       movdqa  xmm6,xmm4
+       paddw   xmm4,xmm0
+       pmulhw  xmm4,[PW_F1847] ; xmm4=z5
+       pmulhw  xmm6,[PW_MF1613]
+       pmulhw  xmm0,[PW_F1082]
+       psubw   xmm6,xmm7
+       psubw   xmm0,xmm4               ; xmm0=tmp10
+       paddw   xmm6,xmm4               ; xmm6=tmp12
+
+       ; -- Final output stage
+
+       psubw   xmm6,xmm5               ; xmm6=tmp6
+       movdqa  xmm7,xmm1
+       movdqa  xmm4,xmm3
+       paddw   xmm1,xmm5               ; xmm1=data0=(00 10 20 30 40 50 60 70)
+       paddw   xmm3,xmm6               ; xmm3=data1=(01 11 21 31 41 51 61 71)
+       psraw   xmm1,(PASS1_BITS+3)     ; descale
+       psraw   xmm3,(PASS1_BITS+3)     ; descale
+       psubw   xmm7,xmm5               ; xmm7=data7=(07 17 27 37 47 57 67 77)
+       psubw   xmm4,xmm6               ; xmm4=data6=(06 16 26 36 46 56 66 76)
+       psraw   xmm7,(PASS1_BITS+3)     ; descale
+       psraw   xmm4,(PASS1_BITS+3)     ; descale
+       psubw   xmm2,xmm6               ; xmm2=tmp5
+
+       packsswb  xmm1,xmm4     ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+       packsswb  xmm3,xmm7     ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+       movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp2
+       movdqa  xmm6, XMMWORD [wk(0)]   ; xmm6=tmp3
+
+       paddw   xmm0,xmm2               ; xmm0=tmp4
+       movdqa  xmm4,xmm5
+       movdqa  xmm7,xmm6
+       paddw   xmm5,xmm2               ; xmm5=data2=(02 12 22 32 42 52 62 72)
+       paddw   xmm6,xmm0               ; xmm6=data4=(04 14 24 34 44 54 64 74)
+       psraw   xmm5,(PASS1_BITS+3)     ; descale
+       psraw   xmm6,(PASS1_BITS+3)     ; descale
+       psubw   xmm4,xmm2               ; xmm4=data5=(05 15 25 35 45 55 65 75)
+       psubw   xmm7,xmm0               ; xmm7=data3=(03 13 23 33 43 53 63 73)
+       psraw   xmm4,(PASS1_BITS+3)     ; descale
+       psraw   xmm7,(PASS1_BITS+3)     ; descale
+
+       movdqa    xmm2,[PB_CENTERJSAMP] ; xmm2=[PB_CENTERJSAMP]
+
+       packsswb  xmm5,xmm6     ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+       packsswb  xmm7,xmm4     ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+       ; Re-centre the saturated signed bytes around CENTERJSAMPLE.
+       paddb     xmm1,xmm2
+       paddb     xmm3,xmm2
+       paddb     xmm5,xmm2
+       paddb     xmm7,xmm2
+
+       movdqa    xmm0,xmm1     ; transpose coefficients(phase 1)
+       punpcklbw xmm1,xmm3     ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+       punpckhbw xmm0,xmm3     ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+       movdqa    xmm6,xmm5     ; transpose coefficients(phase 1)
+       punpcklbw xmm5,xmm7     ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+       punpckhbw xmm6,xmm7     ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+       movdqa    xmm4,xmm1     ; transpose coefficients(phase 2)
+       punpcklwd xmm1,xmm5     ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+       punpckhwd xmm4,xmm5     ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+       movdqa    xmm2,xmm6     ; transpose coefficients(phase 2)
+       punpcklwd xmm6,xmm0     ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+       punpckhwd xmm2,xmm0     ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+       movdqa    xmm3,xmm1     ; transpose coefficients(phase 3)
+       punpckldq xmm1,xmm6     ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+       punpckhdq xmm3,xmm6     ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+       movdqa    xmm7,xmm4     ; transpose coefficients(phase 3)
+       punpckldq xmm4,xmm2     ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+       punpckhdq xmm7,xmm2     ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+       ; Swap the 64-bit halves so the odd rows sit in the low quadword,
+       ; which is the half that movq stores.
+       pshufd  xmm5,xmm1,0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+       pshufd  xmm0,xmm3,0x4E  ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+       pshufd  xmm6,xmm4,0x4E  ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+       pshufd  xmm2,xmm7,0x4E  ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+       ; Store 8 samples to each output row at column rax (output_col).
+       mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+       mov     rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+       movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
+       movq    XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
+       mov     rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
+       mov     rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
+       movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
+       movq    XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7
+
+       mov     rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+       mov     rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+       movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
+       movq    XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
+       mov     rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
+       mov     rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
+       movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
+       movq    XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
+
+       uncollect_args
+       mov     rsp,rbp         ; rsp <- aligned rbp
+       pop     rsp             ; rsp <- stack pointer saved in the prologue
+       pop     rbp
+       ret
diff --git a/common/jpeg/simd/jiss2int-64.asm b/common/jpeg/simd/jiss2int-64.asm
new file mode 100644 (file)
index 0000000..cfeb42d
--- /dev/null
@@ -0,0 +1,844 @@
+;
+; jiss2int.asm - accurate integer IDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slow-but-accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see the jidctint.c for
+; more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+; Fixed-point configuration for the accurate integer IDCT.
+;   CONST_BITS - number of fractional bits carried by the F_* multipliers.
+;   PASS1_BITS - extra fractional bits applied in pass 1 (the all-zero-AC
+;                shortcut shifts the dequantized DC row left by this amount).
+%define CONST_BITS     13
+%define PASS1_BITS     2
+
+; Bit counts from which the PD_DESCALE_P1 / PD_DESCALE_P2 constants in the
+; table below are built.
+%define DESCALE_P1     (CONST_BITS-PASS1_BITS)
+%define DESCALE_P2     (CONST_BITS+PASS1_BITS+3)
+
+; FIX(x) denotes x * 2^CONST_BITS rounded to the nearest integer.
+; For the default CONST_BITS the values are written out literally; for any
+; other setting they are derived from the same constants pre-scaled by 2^30.
+%if CONST_BITS == 13
+F_0_298        equ      2446           ; FIX(0.298631336)
+F_0_390        equ      3196           ; FIX(0.390180644)
+F_0_541        equ      4433           ; FIX(0.541196100)
+F_0_765        equ      6270           ; FIX(0.765366865)
+F_0_899        equ      7373           ; FIX(0.899976223)
+F_1_175        equ      9633           ; FIX(1.175875602)
+F_1_501        equ     12299           ; FIX(1.501321110)
+F_1_847        equ     15137           ; FIX(1.847759065)
+F_1_961        equ     16069           ; FIX(1.961570560)
+F_2_053        equ     16819           ; FIX(2.053119869)
+F_2_562        equ     20995           ; FIX(2.562915447)
+F_3_072        equ     25172           ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+; DESCALE(x,n) is a right shift by n bits with round-to-nearest
+; (half of 2^n is added before shifting).
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
+F_0_298        equ     DESCALE( 320652955,30-CONST_BITS)       ; FIX(0.298631336)
+F_0_390        equ     DESCALE( 418953276,30-CONST_BITS)       ; FIX(0.390180644)
+F_0_541        equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
+F_0_765        equ     DESCALE( 821806413,30-CONST_BITS)       ; FIX(0.765366865)
+F_0_899        equ     DESCALE( 966342111,30-CONST_BITS)       ; FIX(0.899976223)
+F_1_175        equ     DESCALE(1262586813,30-CONST_BITS)       ; FIX(1.175875602)
+F_1_501        equ     DESCALE(1612031267,30-CONST_BITS)       ; FIX(1.501321110)
+F_1_847        equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
+F_1_961        equ     DESCALE(2106220350,30-CONST_BITS)       ; FIX(1.961570560)
+F_2_053        equ     DESCALE(2204520673,30-CONST_BITS)       ; FIX(2.053119869)
+F_2_562        equ     DESCALE(2751909506,30-CONST_BITS)       ; FIX(2.562915447)
+F_3_072        equ     DESCALE(3299298341,30-CONST_BITS)       ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+       SECTION SEG_CONST
+
+; Constant table for jsimd_idct_islow_sse2.
+;
+; Each PW_* entry is one 16-byte XMM operand: a pair of signed 16-bit
+; multipliers repeated 4 times.  The IDCT interleaves two inputs with
+; punpck[lh]wd and applies pmaddwd, so each 32-bit result lane is
+;   first_input * first_word + second_input * second_word.
+; The names give the combined constants to two decimals, with an M prefix
+; for negative values (e.g. F130 = 0.541 + 0.765, MF130 = 0.541 - 1.847).
+
+       alignz  16
+       global  EXTN(jconst_idct_islow_sse2)
+
+EXTN(jconst_idct_islow_sse2):
+
+PW_F130_F054   times 4 dw  (F_0_541+F_0_765), F_0_541
+PW_F054_MF130  times 4 dw  F_0_541, (F_0_541-F_1_847)
+PW_MF078_F117  times 4 dw  (F_1_175-F_1_961), F_1_175
+PW_F117_F078   times 4 dw  F_1_175, (F_1_175-F_0_390)
+PW_MF060_MF089 times 4 dw  (F_0_298-F_0_899),-F_0_899
+PW_MF089_F060  times 4 dw -F_0_899, (F_1_501-F_0_899)
+PW_MF050_MF256 times 4 dw  (F_2_053-F_2_562),-F_2_562
+PW_MF256_F050  times 4 dw -F_2_562, (F_3_072-F_2_562)
+; Four 32-bit lanes each holding 2^(DESCALE_Pn - 1), i.e. half of
+; 2^DESCALE_Pn -- presumably the rounding bias added before the right
+; shift by DESCALE_Pn bits (the use site is not in this part of the file).
+PD_DESCALE_P1  times 4 dd  1 << (DESCALE_P1-1)
+PD_DESCALE_P2  times 4 dd  1 << (DESCALE_P2-1)
+; 16 copies of CENTERJSAMPLE, one per byte lane.
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE
+
+       alignz  16
+
+; --------------------------------------------------------------------------
+       SECTION SEG_TEXT
+       BITS    64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_sse2 (void * dct_table, JCOEFPTR coef_block,
+;                        JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void * dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13 = JDIMENSION output_col
+
+%define original_rbp   rbp+0
+%define wk(i)          rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM         12
+
+       align   16
+       global  EXTN(jsimd_idct_islow_sse2)
+
+EXTN(jsimd_idct_islow_sse2):
+       push    rbp
+       mov     rax,rsp                         ; rax = original rbp
+       sub     rsp, byte 4
+       and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
+       mov     [rsp],rax
+       mov     rbp,rsp                         ; rbp = aligned rbp
+       lea     rsp, [wk(0)]
+       collect_args
+
+       ; ---- Pass 1: process columns from input.
+
+       mov     rdx, r10        ; quantptr
+       mov     rsi, r11                ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
+       mov     eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+       or      eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+       jnz     near .columnDCT
+
+       movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+       movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+       por     xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+       por     xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+       por     xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+       por     xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+       por     xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+       por     xmm1,xmm0
+       packsswb xmm1,xmm1
+       packsswb xmm1,xmm1
+       movd    eax,xmm1
+       test    rax,rax
+       jnz     short .columnDCT
+
+       ; -- AC terms all zero
+
+       movdqa  xmm5, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+       pmullw  xmm5, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+       psllw   xmm5,PASS1_BITS
+
+       movdqa    xmm4,xmm5             ; xmm5=in0=(00 01 02 03 04 05 06 07)
+       punpcklwd xmm5,xmm5             ; xmm5=(00 00 01 01 02 02 03 03)
+       punpckhwd xmm4,xmm4             ; xmm4=(04 04 05 05 06 06 07 07)
+
+       pshufd  xmm7,xmm5,0x00          ; xmm7=col0=(00 00 00 00 00 00 00 00)
+       pshufd  xmm6,xmm5,0x55          ; xmm6=col1=(01 01 01 01 01 01 01 01)
+       pshufd  xmm1,xmm5,0xAA          ; xmm1=col2=(02 02 02 02 02 02 02 02)
+       pshufd  xmm5,xmm5,0xFF          ; xmm5=col3=(03 03 03 03 03 03 03 03)
+       pshufd  xmm0,xmm4,0x00          ; xmm0=col4=(04 04 04 04 04 04 04 04)
+       pshufd  xmm3,xmm4,0x55          ; xmm3=col5=(05 05 05 05 05 05 05 05)
+       pshufd  xmm2,xmm4,0xAA          ; xmm2=col6=(06 06 06 06 06 06 06 06)
+       pshufd  xmm4,xmm4,0xFF          ; xmm4=col7=(07 07 07 07 07 07 07 07)
+
+       movdqa  XMMWORD [wk(8)], xmm6   ; wk(8)=col1
+       movdqa  XMMWORD [wk(9)], xmm5   ; wk(9)=col3
+       movdqa  XMMWORD [wk(10)], xmm3  ; wk(10)=col5
+       movdqa  XMMWORD [wk(11)], xmm4  ; wk(11)=col7
+       jmp     near .column_end
+%endif
+.columnDCT:
+
+       ; -- Even part
+
+       movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+       movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+       pmullw  xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+       pmullw  xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+       movdqa  xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+       movdqa  xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+       pmullw  xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+       pmullw  xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+       ; (Original)
+       ; z1 = (z2 + z3) * 0.541196100;
+       ; tmp2 = z1 + z3 * -1.847759065;
+       ; tmp3 = z1 + z2 * 0.765366865;
+       ;
+       ; (This implementation)
+       ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+       ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+       movdqa    xmm4,xmm1             ; xmm1=in2=z2
+       movdqa    xmm5,xmm1
+       punpcklwd xmm4,xmm3             ; xmm3=in6=z3
+       punpckhwd xmm5,xmm3
+       movdqa    xmm1,xmm4
+       movdqa    xmm3,xmm5
+       pmaddwd   xmm4,[PW_F130_F054]   ; xmm4=tmp3L
+       pmaddwd   xmm5,[PW_F130_F054]   ; xmm5=tmp3H
+       pmaddwd   xmm1,[PW_F054_MF130]  ; xmm1=tmp2L
+       pmaddwd   xmm3,[PW_F054_MF130]  ; xmm3=tmp2H
+
+       movdqa    xmm6,xmm0
+       paddw     xmm0,xmm2             ; xmm0=in0+in4
+       psubw     xmm6,xmm2             ; xmm6=in0-in4
+
+       pxor      xmm7,xmm7
+       pxor      xmm2,xmm2
+       punpcklwd xmm7,xmm0             ; xmm7=tmp0L
+       punpckhwd xmm2,xmm0             ; xmm2=tmp0H
+       psrad     xmm7,(16-CONST_BITS)  ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+       psrad     xmm2,(16-CONST_BITS)  ; psrad xmm2,16 & pslld xmm2,CONST_BITS
+
+       movdqa  xmm0,xmm7
+       paddd   xmm7,xmm4               ; xmm7=tmp10L
+       psubd   xmm0,xmm4               ; xmm0=tmp13L
+       movdqa  xmm4,xmm2
+       paddd   xmm2,xmm5               ; xmm2=tmp10H
+       psubd   xmm4,xmm5               ; xmm4=tmp13H
+
+       movdqa  XMMWORD [wk(0)], xmm7   ; wk(0)=tmp10L
+       movdqa  XMMWORD [wk(1)], xmm2   ; wk(1)=tmp10H
+       movdqa  XMMWORD [wk(2)], xmm0   ; wk(2)=tmp13L
+       movdqa  XMMWORD [wk(3)], xmm4   ; wk(3)=tmp13H
+
+       pxor      xmm5,xmm5
+       pxor      xmm7,xmm7
+       punpcklwd xmm5,xmm6             ; xmm5=tmp1L
+       punpckhwd xmm7,xmm6             ; xmm7=tmp1H
+       psrad     xmm5,(16-CONST_BITS)  ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+       psrad     xmm7,(16-CONST_BITS)  ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+
+       movdqa  xmm2,xmm5
+       paddd   xmm5,xmm1               ; xmm5=tmp11L
+       psubd   xmm2,xmm1               ; xmm2=tmp12L
+       movdqa  xmm0,xmm7
+       paddd   xmm7,xmm3               ; xmm7=tmp11H
+       psubd   xmm0,xmm3               ; xmm0=tmp12H
+
+       movdqa  XMMWORD [wk(4)], xmm5   ; wk(4)=tmp11L
+       movdqa  XMMWORD [wk(5)], xmm7   ; wk(5)=tmp11H
+       movdqa  XMMWORD [wk(6)], xmm2   ; wk(6)=tmp12L
+       movdqa  XMMWORD [wk(7)], xmm0   ; wk(7)=tmp12H
+
+       ; -- Odd part
+
+       movdqa  xmm4, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+       movdqa  xmm6, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+       pmullw  xmm4, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+       pmullw  xmm6, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+       movdqa  xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+       movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+       pmullw  xmm1, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+       pmullw  xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+       movdqa  xmm5,xmm6
+       movdqa  xmm7,xmm4
+       paddw   xmm5,xmm3               ; xmm5=z3
+       paddw   xmm7,xmm1               ; xmm7=z4
+
+       ; (Original)
+       ; z5 = (z3 + z4) * 1.175875602;
+       ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+       ; z3 += z5;  z4 += z5;
+       ;
+       ; (This implementation)
+       ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+       ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+       movdqa    xmm2,xmm5
+       movdqa    xmm0,xmm5
+       punpcklwd xmm2,xmm7
+       punpckhwd xmm0,xmm7
+       movdqa    xmm5,xmm2
+       movdqa    xmm7,xmm0
+       pmaddwd   xmm2,[PW_MF078_F117]  ; xmm2=z3L
+       pmaddwd   xmm0,[PW_MF078_F117]  ; xmm0=z3H
+       pmaddwd   xmm5,[PW_F117_F078]   ; xmm5=z4L
+       pmaddwd   xmm7,[PW_F117_F078]   ; xmm7=z4H
+
+       movdqa  XMMWORD [wk(10)], xmm2  ; wk(10)=z3L
+       movdqa  XMMWORD [wk(11)], xmm0  ; wk(11)=z3H
+
+       ; (Original)
+       ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+       ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+       ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+       ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+       ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+       ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+       ;
+       ; (This implementation)
+       ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+       ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+       ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+       ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+       ; tmp0 += z3;  tmp1 += z4;
+       ; tmp2 += z3;  tmp3 += z4;
+
+       movdqa    xmm2,xmm3
+       movdqa    xmm0,xmm3
+       punpcklwd xmm2,xmm4
+       punpckhwd xmm0,xmm4
+       movdqa    xmm3,xmm2
+       movdqa    xmm4,xmm0
+       pmaddwd   xmm2,[PW_MF060_MF089] ; xmm2=tmp0L
+       pmaddwd   xmm0,[PW_MF060_MF089] ; xmm0=tmp0H
+       pmaddwd   xmm3,[PW_MF089_F060]  ; xmm3=tmp3L
+       pmaddwd   xmm4,[PW_MF089_F060]  ; xmm4=tmp3H
+
+       paddd   xmm2, XMMWORD [wk(10)]  ; xmm2=tmp0L
+       paddd   xmm0, XMMWORD [wk(11)]  ; xmm0=tmp0H
+       paddd   xmm3,xmm5               ; xmm3=tmp3L
+       paddd   xmm4,xmm7               ; xmm4=tmp3H
+
+       movdqa  XMMWORD [wk(8)], xmm2   ; wk(8)=tmp0L
+       movdqa  XMMWORD [wk(9)], xmm0   ; wk(9)=tmp0H
+
+       movdqa    xmm2,xmm1
+       movdqa    xmm0,xmm1
+       punpcklwd xmm2,xmm6
+       punpckhwd xmm0,xmm6
+       movdqa    xmm1,xmm2
+       movdqa    xmm6,xmm0
+       pmaddwd   xmm2,[PW_MF050_MF256] ; xmm2=tmp1L
+       pmaddwd   xmm0,[PW_MF050_MF256] ; xmm0=tmp1H
+       pmaddwd   xmm1,[PW_MF256_F050]  ; xmm1=tmp2L
+       pmaddwd   xmm6,[PW_MF256_F050]  ; xmm6=tmp2H
+
+       paddd   xmm2,xmm5               ; xmm2=tmp1L
+       paddd   xmm0,xmm7               ; xmm0=tmp1H
+       paddd   xmm1, XMMWORD [wk(10)]  ; xmm1=tmp2L
+       paddd   xmm6, XMMWORD [wk(11)]  ; xmm6=tmp2H
+
+       movdqa  XMMWORD [wk(10)], xmm2  ; wk(10)=tmp1L
+       movdqa  XMMWORD [wk(11)], xmm0  ; wk(11)=tmp1H
+
+       ; -- Final output stage
+
+       movdqa  xmm5, XMMWORD [wk(0)]   ; xmm5=tmp10L
+       movdqa  xmm7, XMMWORD [wk(1)]   ; xmm7=tmp10H
+
+       movdqa  xmm2,xmm5
+       movdqa  xmm0,xmm7
+       paddd   xmm5,xmm3               ; xmm5=data0L
+       paddd   xmm7,xmm4               ; xmm7=data0H
+       psubd   xmm2,xmm3               ; xmm2=data7L
+       psubd   xmm0,xmm4               ; xmm0=data7H
+
+       movdqa  xmm3,[PD_DESCALE_P1]    ; xmm3=[PD_DESCALE_P1]
+
+       paddd   xmm5,xmm3
+       paddd   xmm7,xmm3
+       psrad   xmm5,DESCALE_P1
+       psrad   xmm7,DESCALE_P1
+       paddd   xmm2,xmm3
+       paddd   xmm0,xmm3
+       psrad   xmm2,DESCALE_P1
+       psrad   xmm0,DESCALE_P1
+
+       packssdw  xmm5,xmm7             ; xmm5=data0=(00 01 02 03 04 05 06 07)
+       packssdw  xmm2,xmm0             ; xmm2=data7=(70 71 72 73 74 75 76 77)
+
+       movdqa  xmm4, XMMWORD [wk(4)]   ; xmm4=tmp11L
+       movdqa  xmm3, XMMWORD [wk(5)]   ; xmm3=tmp11H
+
+       movdqa  xmm7,xmm4
+       movdqa  xmm0,xmm3
+       paddd   xmm4,xmm1               ; xmm4=data1L
+       paddd   xmm3,xmm6               ; xmm3=data1H
+       psubd   xmm7,xmm1               ; xmm7=data6L
+       psubd   xmm0,xmm6               ; xmm0=data6H
+
+       movdqa  xmm1,[PD_DESCALE_P1]    ; xmm1=[PD_DESCALE_P1]
+
+       paddd   xmm4,xmm1
+       paddd   xmm3,xmm1
+       psrad   xmm4,DESCALE_P1
+       psrad   xmm3,DESCALE_P1
+       paddd   xmm7,xmm1
+       paddd   xmm0,xmm1
+       psrad   xmm7,DESCALE_P1
+       psrad   xmm0,DESCALE_P1
+
+       packssdw  xmm4,xmm3             ; xmm4=data1=(10 11 12 13 14 15 16 17)
+       packssdw  xmm7,xmm0             ; xmm7=data6=(60 61 62 63 64 65 66 67)
+
+       movdqa    xmm6,xmm5             ; transpose coefficients(phase 1)
+       punpcklwd xmm5,xmm4             ; xmm5=(00 10 01 11 02 12 03 13)
+       punpckhwd xmm6,xmm4             ; xmm6=(04 14 05 15 06 16 07 17)
+       movdqa    xmm1,xmm7             ; transpose coefficients(phase 1)
+       punpcklwd xmm7,xmm2             ; xmm7=(60 70 61 71 62 72 63 73)
+       punpckhwd xmm1,xmm2             ; xmm1=(64 74 65 75 66 76 67 77)
+
+       movdqa  xmm3, XMMWORD [wk(6)]   ; xmm3=tmp12L
+       movdqa  xmm0, XMMWORD [wk(7)]   ; xmm0=tmp12H
+       movdqa  xmm4, XMMWORD [wk(10)]  ; xmm4=tmp1L
+       movdqa  xmm2, XMMWORD [wk(11)]  ; xmm2=tmp1H
+
+       movdqa  XMMWORD [wk(0)], xmm5   ; wk(0)=(00 10 01 11 02 12 03 13)
+       movdqa  XMMWORD [wk(1)], xmm6   ; wk(1)=(04 14 05 15 06 16 07 17)
+       movdqa  XMMWORD [wk(4)], xmm7   ; wk(4)=(60 70 61 71 62 72 63 73)
+       movdqa  XMMWORD [wk(5)], xmm1   ; wk(5)=(64 74 65 75 66 76 67 77)
+
+       movdqa  xmm5,xmm3
+       movdqa  xmm6,xmm0
+       paddd   xmm3,xmm4               ; xmm3=data2L
+       paddd   xmm0,xmm2               ; xmm0=data2H
+       psubd   xmm5,xmm4               ; xmm5=data5L
+       psubd   xmm6,xmm2               ; xmm6=data5H
+
+       movdqa  xmm7,[PD_DESCALE_P1]    ; xmm7=[PD_DESCALE_P1]
+
+       paddd   xmm3,xmm7
+       paddd   xmm0,xmm7
+       psrad   xmm3,DESCALE_P1
+       psrad   xmm0,DESCALE_P1
+       paddd   xmm5,xmm7
+       paddd   xmm6,xmm7
+       psrad   xmm5,DESCALE_P1
+       psrad   xmm6,DESCALE_P1
+
+       packssdw  xmm3,xmm0             ; xmm3=data2=(20 21 22 23 24 25 26 27)
+       packssdw  xmm5,xmm6             ; xmm5=data5=(50 51 52 53 54 55 56 57)
+
+       movdqa  xmm1, XMMWORD [wk(2)]   ; xmm1=tmp13L
+       movdqa  xmm4, XMMWORD [wk(3)]   ; xmm4=tmp13H
+       movdqa  xmm2, XMMWORD [wk(8)]   ; xmm2=tmp0L
+       movdqa  xmm7, XMMWORD [wk(9)]   ; xmm7=tmp0H
+
+       movdqa  xmm0,xmm1
+       movdqa  xmm6,xmm4
+       paddd   xmm1,xmm2               ; xmm1=data3L
+       paddd   xmm4,xmm7               ; xmm4=data3H
+       psubd   xmm0,xmm2               ; xmm0=data4L
+       psubd   xmm6,xmm7               ; xmm6=data4H
+
+       movdqa  xmm2,[PD_DESCALE_P1]    ; xmm2=[PD_DESCALE_P1]
+
+       paddd   xmm1,xmm2
+       paddd   xmm4,xmm2
+       psrad   xmm1,DESCALE_P1
+       psrad   xmm4,DESCALE_P1
+       paddd   xmm0,xmm2
+       paddd   xmm6,xmm2
+       psrad   xmm0,DESCALE_P1
+       psrad   xmm6,DESCALE_P1
+
+       packssdw  xmm1,xmm4             ; xmm1=data3=(30 31 32 33 34 35 36 37)
+       packssdw  xmm0,xmm6             ; xmm0=data4=(40 41 42 43 44 45 46 47)
+
+       movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=(00 10 01 11 02 12 03 13)
+       movdqa  xmm2, XMMWORD [wk(1)]   ; xmm2=(04 14 05 15 06 16 07 17)
+
+       movdqa    xmm4,xmm3             ; transpose coefficients(phase 1)
+       punpcklwd xmm3,xmm1             ; xmm3=(20 30 21 31 22 32 23 33)
+       punpckhwd xmm4,xmm1             ; xmm4=(24 34 25 35 26 36 27 37)
+       movdqa    xmm6,xmm0             ; transpose coefficients(phase 1)
+       punpcklwd xmm0,xmm5             ; xmm0=(40 50 41 51 42 52 43 53)
+       punpckhwd xmm6,xmm5             ; xmm6=(44 54 45 55 46 56 47 57)
+
+       movdqa    xmm1,xmm7             ; transpose coefficients(phase 2)
+       punpckldq xmm7,xmm3             ; xmm7=(00 10 20 30 01 11 21 31)
+       punpckhdq xmm1,xmm3             ; xmm1=(02 12 22 32 03 13 23 33)
+       movdqa    xmm5,xmm2             ; transpose coefficients(phase 2)
+       punpckldq xmm2,xmm4             ; xmm2=(04 14 24 34 05 15 25 35)
+       punpckhdq xmm5,xmm4             ; xmm5=(06 16 26 36 07 17 27 37)
+
+       movdqa  xmm3, XMMWORD [wk(4)]   ; xmm3=(60 70 61 71 62 72 63 73)
+       movdqa  xmm4, XMMWORD [wk(5)]   ; xmm4=(64 74 65 75 66 76 67 77)
+
+       movdqa  XMMWORD [wk(6)], xmm2   ; wk(6)=(04 14 24 34 05 15 25 35)
+       movdqa  XMMWORD [wk(7)], xmm5   ; wk(7)=(06 16 26 36 07 17 27 37)
+
+       movdqa    xmm2,xmm0             ; transpose coefficients(phase 2)
+       punpckldq xmm0,xmm3             ; xmm0=(40 50 60 70 41 51 61 71)
+       punpckhdq xmm2,xmm3             ; xmm2=(42 52 62 72 43 53 63 73)
+       movdqa    xmm5,xmm6             ; transpose coefficients(phase 2)
+       punpckldq xmm6,xmm4             ; xmm6=(44 54 64 74 45 55 65 75)
+       punpckhdq xmm5,xmm4             ; xmm5=(46 56 66 76 47 57 67 77)
+
+       movdqa     xmm3,xmm7            ; transpose coefficients(phase 3)
+       punpcklqdq xmm7,xmm0            ; xmm7=col0=(00 10 20 30 40 50 60 70)
+       punpckhqdq xmm3,xmm0            ; xmm3=col1=(01 11 21 31 41 51 61 71)
+       movdqa     xmm4,xmm1            ; transpose coefficients(phase 3)
+       punpcklqdq xmm1,xmm2            ; xmm1=col2=(02 12 22 32 42 52 62 72)
+       punpckhqdq xmm4,xmm2            ; xmm4=col3=(03 13 23 33 43 53 63 73)
+
+       movdqa  xmm0, XMMWORD [wk(6)]   ; xmm0=(04 14 24 34 05 15 25 35)
+       movdqa  xmm2, XMMWORD [wk(7)]   ; xmm2=(06 16 26 36 07 17 27 37)
+
+       movdqa  XMMWORD [wk(8)], xmm3   ; wk(8)=col1
+       movdqa  XMMWORD [wk(9)], xmm4   ; wk(9)=col3
+
+       movdqa     xmm3,xmm0            ; transpose coefficients(phase 3)
+       punpcklqdq xmm0,xmm6            ; xmm0=col4=(04 14 24 34 44 54 64 74)
+       punpckhqdq xmm3,xmm6            ; xmm3=col5=(05 15 25 35 45 55 65 75)
+       movdqa     xmm4,xmm2            ; transpose coefficients(phase 3)
+       punpcklqdq xmm2,xmm5            ; xmm2=col6=(06 16 26 36 46 56 66 76)
+       punpckhqdq xmm4,xmm5            ; xmm4=col7=(07 17 27 37 47 57 67 77)
+
+       movdqa  XMMWORD [wk(10)], xmm3  ; wk(10)=col5
+       movdqa  XMMWORD [wk(11)], xmm4  ; wk(11)=col7
+.column_end:
+
+       ; -- Prefetch the next coefficient block
+
+       prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+       prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+       prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+       prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+       ; ---- Pass 2: process rows from work array, store into output array.
+
+       mov     rax, [original_rbp]
+       mov     rdi, r12        ; (JSAMPROW *)
+       mov     rax, r13
+
+       ; -- Even part
+
+       ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
+
+       ; (Original)
+       ; z1 = (z2 + z3) * 0.541196100;
+       ; tmp2 = z1 + z3 * -1.847759065;
+       ; tmp3 = z1 + z2 * 0.765366865;
+       ;
+       ; (This implementation)
+       ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+       ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+       movdqa    xmm6,xmm1             ; xmm1=in2=z2
+       movdqa    xmm5,xmm1
+       punpcklwd xmm6,xmm2             ; xmm2=in6=z3
+       punpckhwd xmm5,xmm2
+       movdqa    xmm1,xmm6
+       movdqa    xmm2,xmm5
+       pmaddwd   xmm6,[PW_F130_F054]   ; xmm6=tmp3L
+       pmaddwd   xmm5,[PW_F130_F054]   ; xmm5=tmp3H
+       pmaddwd   xmm1,[PW_F054_MF130]  ; xmm1=tmp2L
+       pmaddwd   xmm2,[PW_F054_MF130]  ; xmm2=tmp2H
+
+       movdqa    xmm3,xmm7
+       paddw     xmm7,xmm0             ; xmm7=in0+in4
+       psubw     xmm3,xmm0             ; xmm3=in0-in4
+
+       pxor      xmm4,xmm4
+       pxor      xmm0,xmm0
+       punpcklwd xmm4,xmm7             ; xmm4=tmp0L
+       punpckhwd xmm0,xmm7             ; xmm0=tmp0H
+       psrad     xmm4,(16-CONST_BITS)  ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+       psrad     xmm0,(16-CONST_BITS)  ; psrad xmm0,16 & pslld xmm0,CONST_BITS
+
+       movdqa  xmm7,xmm4
+       paddd   xmm4,xmm6               ; xmm4=tmp10L
+       psubd   xmm7,xmm6               ; xmm7=tmp13L
+       movdqa  xmm6,xmm0
+       paddd   xmm0,xmm5               ; xmm0=tmp10H
+       psubd   xmm6,xmm5               ; xmm6=tmp13H
+
+       movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=tmp10L
+       movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=tmp10H
+       movdqa  XMMWORD [wk(2)], xmm7   ; wk(2)=tmp13L
+       movdqa  XMMWORD [wk(3)], xmm6   ; wk(3)=tmp13H
+
+       pxor      xmm5,xmm5
+       pxor      xmm4,xmm4
+       punpcklwd xmm5,xmm3             ; xmm5=tmp1L
+       punpckhwd xmm4,xmm3             ; xmm4=tmp1H
+       psrad     xmm5,(16-CONST_BITS)  ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+       psrad     xmm4,(16-CONST_BITS)  ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+
+       movdqa  xmm0,xmm5
+       paddd   xmm5,xmm1               ; xmm5=tmp11L
+       psubd   xmm0,xmm1               ; xmm0=tmp12L
+       movdqa  xmm7,xmm4
+       paddd   xmm4,xmm2               ; xmm4=tmp11H
+       psubd   xmm7,xmm2               ; xmm7=tmp12H
+
+       movdqa  XMMWORD [wk(4)], xmm5   ; wk(4)=tmp11L
+       movdqa  XMMWORD [wk(5)], xmm4   ; wk(5)=tmp11H
+       movdqa  XMMWORD [wk(6)], xmm0   ; wk(6)=tmp12L
+       movdqa  XMMWORD [wk(7)], xmm7   ; wk(7)=tmp12H
+
+       ; -- Odd part
+
+       movdqa  xmm6, XMMWORD [wk(9)]   ; xmm6=col3
+       movdqa  xmm3, XMMWORD [wk(8)]   ; xmm3=col1
+       movdqa  xmm1, XMMWORD [wk(11)]  ; xmm1=col7
+       movdqa  xmm2, XMMWORD [wk(10)]  ; xmm2=col5
+
+       movdqa  xmm5,xmm6
+       movdqa  xmm4,xmm3
+       paddw   xmm5,xmm1               ; xmm5=z3
+       paddw   xmm4,xmm2               ; xmm4=z4
+
+       ; (Original)
+       ; z5 = (z3 + z4) * 1.175875602;
+       ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+       ; z3 += z5;  z4 += z5;
+       ;
+       ; (This implementation)
+       ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+       ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+       movdqa    xmm0,xmm5
+       movdqa    xmm7,xmm5
+       punpcklwd xmm0,xmm4
+       punpckhwd xmm7,xmm4
+       movdqa    xmm5,xmm0
+       movdqa    xmm4,xmm7
+       pmaddwd   xmm0,[PW_MF078_F117]  ; xmm0=z3L
+       pmaddwd   xmm7,[PW_MF078_F117]  ; xmm7=z3H
+       pmaddwd   xmm5,[PW_F117_F078]   ; xmm5=z4L
+       pmaddwd   xmm4,[PW_F117_F078]   ; xmm4=z4H
+
+       movdqa  XMMWORD [wk(10)], xmm0  ; wk(10)=z3L
+       movdqa  XMMWORD [wk(11)], xmm7  ; wk(11)=z3H
+
+       ; (Original)
+       ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+       ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+       ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+       ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+       ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+       ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+       ;
+       ; (This implementation)
+       ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+       ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+       ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+       ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+       ; tmp0 += z3;  tmp1 += z4;
+       ; tmp2 += z3;  tmp3 += z4;
+
+       movdqa    xmm0,xmm1
+       movdqa    xmm7,xmm1
+       punpcklwd xmm0,xmm3
+       punpckhwd xmm7,xmm3
+       movdqa    xmm1,xmm0
+       movdqa    xmm3,xmm7
+       pmaddwd   xmm0,[PW_MF060_MF089] ; xmm0=tmp0L
+       pmaddwd   xmm7,[PW_MF060_MF089] ; xmm7=tmp0H
+       pmaddwd   xmm1,[PW_MF089_F060]  ; xmm1=tmp3L
+       pmaddwd   xmm3,[PW_MF089_F060]  ; xmm3=tmp3H
+
+       paddd   xmm0, XMMWORD [wk(10)]  ; xmm0=tmp0L
+       paddd   xmm7, XMMWORD [wk(11)]  ; xmm7=tmp0H
+       paddd   xmm1,xmm5               ; xmm1=tmp3L
+       paddd   xmm3,xmm4               ; xmm3=tmp3H
+
+       movdqa  XMMWORD [wk(8)], xmm0   ; wk(8)=tmp0L
+       movdqa  XMMWORD [wk(9)], xmm7   ; wk(9)=tmp0H
+
+       movdqa    xmm0,xmm2
+       movdqa    xmm7,xmm2
+       punpcklwd xmm0,xmm6
+       punpckhwd xmm7,xmm6
+       movdqa    xmm2,xmm0
+       movdqa    xmm6,xmm7
+       pmaddwd   xmm0,[PW_MF050_MF256] ; xmm0=tmp1L
+       pmaddwd   xmm7,[PW_MF050_MF256] ; xmm7=tmp1H
+       pmaddwd   xmm2,[PW_MF256_F050]  ; xmm2=tmp2L
+       pmaddwd   xmm6,[PW_MF256_F050]  ; xmm6=tmp2H
+
+       paddd   xmm0,xmm5               ; xmm0=tmp1L
+       paddd   xmm7,xmm4               ; xmm7=tmp1H
+       paddd   xmm2, XMMWORD [wk(10)]  ; xmm2=tmp2L
+       paddd   xmm6, XMMWORD [wk(11)]  ; xmm6=tmp2H
+
+       movdqa  XMMWORD [wk(10)], xmm0  ; wk(10)=tmp1L
+       movdqa  XMMWORD [wk(11)], xmm7  ; wk(11)=tmp1H
+
+       ; -- Final output stage
+
+       movdqa  xmm5, XMMWORD [wk(0)]   ; xmm5=tmp10L
+       movdqa  xmm4, XMMWORD [wk(1)]   ; xmm4=tmp10H
+
+       movdqa  xmm0,xmm5
+       movdqa  xmm7,xmm4
+       paddd   xmm5,xmm1               ; xmm5=data0L
+       paddd   xmm4,xmm3               ; xmm4=data0H
+       psubd   xmm0,xmm1               ; xmm0=data7L
+       psubd   xmm7,xmm3               ; xmm7=data7H
+
+       movdqa  xmm1,[PD_DESCALE_P2]    ; xmm1=[PD_DESCALE_P2]
+
+       paddd   xmm5,xmm1
+       paddd   xmm4,xmm1
+       psrad   xmm5,DESCALE_P2
+       psrad   xmm4,DESCALE_P2
+       paddd   xmm0,xmm1
+       paddd   xmm7,xmm1
+       psrad   xmm0,DESCALE_P2
+       psrad   xmm7,DESCALE_P2
+
+       packssdw  xmm5,xmm4             ; xmm5=data0=(00 10 20 30 40 50 60 70)
+       packssdw  xmm0,xmm7             ; xmm0=data7=(07 17 27 37 47 57 67 77)
+
+       movdqa  xmm3, XMMWORD [wk(4)]   ; xmm3=tmp11L
+       movdqa  xmm1, XMMWORD [wk(5)]   ; xmm1=tmp11H
+
+       movdqa  xmm4,xmm3
+       movdqa  xmm7,xmm1
+       paddd   xmm3,xmm2               ; xmm3=data1L
+       paddd   xmm1,xmm6               ; xmm1=data1H
+       psubd   xmm4,xmm2               ; xmm4=data6L
+       psubd   xmm7,xmm6               ; xmm7=data6H
+
+       movdqa  xmm2,[PD_DESCALE_P2]    ; xmm2=[PD_DESCALE_P2]
+
+       paddd   xmm3,xmm2
+       paddd   xmm1,xmm2
+       psrad   xmm3,DESCALE_P2
+       psrad   xmm1,DESCALE_P2
+       paddd   xmm4,xmm2
+       paddd   xmm7,xmm2
+       psrad   xmm4,DESCALE_P2
+       psrad   xmm7,DESCALE_P2
+
+       packssdw  xmm3,xmm1             ; xmm3=data1=(01 11 21 31 41 51 61 71)
+       packssdw  xmm4,xmm7             ; xmm4=data6=(06 16 26 36 46 56 66 76)
+
+       packsswb  xmm5,xmm4             ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+       packsswb  xmm3,xmm0             ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+       movdqa  xmm6, XMMWORD [wk(6)]   ; xmm6=tmp12L
+       movdqa  xmm2, XMMWORD [wk(7)]   ; xmm2=tmp12H
+       movdqa  xmm1, XMMWORD [wk(10)]  ; xmm1=tmp1L
+       movdqa  xmm7, XMMWORD [wk(11)]  ; xmm7=tmp1H
+
+       movdqa  XMMWORD [wk(0)], xmm5   ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+       movdqa  XMMWORD [wk(1)], xmm3   ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+       movdqa  xmm4,xmm6
+       movdqa  xmm0,xmm2
+       paddd   xmm6,xmm1               ; xmm6=data2L
+       paddd   xmm2,xmm7               ; xmm2=data2H
+       psubd   xmm4,xmm1               ; xmm4=data5L
+       psubd   xmm0,xmm7               ; xmm0=data5H
+
+       movdqa  xmm5,[PD_DESCALE_P2]    ; xmm5=[PD_DESCALE_P2]
+
+       paddd   xmm6,xmm5
+       paddd   xmm2,xmm5
+       psrad   xmm6,DESCALE_P2
+       psrad   xmm2,DESCALE_P2
+       paddd   xmm4,xmm5
+       paddd   xmm0,xmm5
+       psrad   xmm4,DESCALE_P2
+       psrad   xmm0,DESCALE_P2
+
+       packssdw  xmm6,xmm2             ; xmm6=data2=(02 12 22 32 42 52 62 72)
+       packssdw  xmm4,xmm0             ; xmm4=data5=(05 15 25 35 45 55 65 75)
+
+       movdqa  xmm3, XMMWORD [wk(2)]   ; xmm3=tmp13L
+       movdqa  xmm1, XMMWORD [wk(3)]   ; xmm1=tmp13H
+       movdqa  xmm7, XMMWORD [wk(8)]   ; xmm7=tmp0L
+       movdqa  xmm5, XMMWORD [wk(9)]   ; xmm5=tmp0H
+
+       movdqa  xmm2,xmm3
+       movdqa  xmm0,xmm1
+       paddd   xmm3,xmm7               ; xmm3=data3L
+       paddd   xmm1,xmm5               ; xmm1=data3H
+       psubd   xmm2,xmm7               ; xmm2=data4L
+       psubd   xmm0,xmm5               ; xmm0=data4H
+
+       movdqa  xmm7,[PD_DESCALE_P2]    ; xmm7=[PD_DESCALE_P2]
+
+       paddd   xmm3,xmm7
+       paddd   xmm1,xmm7
+       psrad   xmm3,DESCALE_P2
+       psrad   xmm1,DESCALE_P2
+       paddd   xmm2,xmm7
+       paddd   xmm0,xmm7
+       psrad   xmm2,DESCALE_P2
+       psrad   xmm0,DESCALE_P2
+
+       movdqa    xmm5,[PB_CENTERJSAMP] ; xmm5=[PB_CENTERJSAMP]
+
+       packssdw  xmm3,xmm1             ; xmm3=data3=(03 13 23 33 43 53 63 73)
+       packssdw  xmm2,xmm0             ; xmm2=data4=(04 14 24 34 44 54 64 74)
+
+       movdqa    xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+       movdqa    xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+       packsswb  xmm6,xmm2             ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+       packsswb  xmm3,xmm4             ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+       paddb     xmm7,xmm5
+       paddb     xmm1,xmm5
+       paddb     xmm6,xmm5
+       paddb     xmm3,xmm5
+
+       movdqa    xmm0,xmm7     ; transpose coefficients(phase 1)
+       punpcklbw xmm7,xmm1     ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+       punpckhbw xmm0,xmm1     ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+       movdqa    xmm2,xmm6     ; transpose coefficients(phase 1)
+       punpcklbw xmm6,xmm3     ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+       punpckhbw xmm2,xmm3     ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+       movdqa    xmm4,xmm7     ; transpose coefficients(phase 2)
+       punpcklwd xmm7,xmm6     ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+       punpckhwd xmm4,xmm6     ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+       movdqa    xmm5,xmm2     ; transpose coefficients(phase 2)
+       punpcklwd xmm2,xmm0     ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+       punpckhwd xmm5,xmm0     ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+       movdqa    xmm1,xmm7     ; transpose coefficients(phase 3)
+       punpckldq xmm7,xmm2     ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+       punpckhdq xmm1,xmm2     ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+       movdqa    xmm3,xmm4     ; transpose coefficients(phase 3)
+       punpckldq xmm4,xmm5     ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+       punpckhdq xmm3,xmm5     ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+       pshufd  xmm6,xmm7,0x4E  ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+       pshufd  xmm0,xmm1,0x4E  ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+       pshufd  xmm2,xmm4,0x4E  ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+       pshufd  xmm5,xmm3,0x4E  ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+       mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+       mov     rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+       movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm7
+       movq    XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1
+       mov     rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
+       mov     rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
+       movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
+       movq    XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
+
+       mov     rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+       mov     rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+       movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
+       movq    XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
+       mov     rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
+       mov     rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
+       movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
+       movq    XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
+
+       uncollect_args
+       mov     rsp,rbp         ; rsp <- aligned rbp
+       pop     rsp             ; rsp <- original rbp
+       pop     rbp
+       ret
diff --git a/common/jpeg/simd/jiss2red-64.asm b/common/jpeg/simd/jiss2red-64.asm
new file mode 100644 (file)
index 0000000..381180c
--- /dev/null
@@ -0,0 +1,571 @@
+;
+; jiss2red-64.asm - reduced-size IDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size
+; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see jidctred.c for more details.
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS     13      ; number of fractional bits in the F_x_xxx fixed-point constants below
+%define PASS1_BITS     2       ; pass-1 results are kept scaled up by this many bits (cf. "psllw xmm0,PASS1_BITS" in the DC-only path)
+
+%define DESCALE_P1_4   (CONST_BITS-PASS1_BITS+1)       ; = 12 with the values above; presumably the 4x4 pass-1 descale shift (see PD_DESCALE_P1_4)
+%define DESCALE_P2_4   (CONST_BITS+PASS1_BITS+3+1)     ; = 19 with the values above; presumably the 4x4 pass-2 descale shift (see PD_DESCALE_P2_4)
+%define DESCALE_P1_2   (CONST_BITS-PASS1_BITS+2)       ; = 13 with the values above; presumably the 2x2 pass-1 descale shift (see PD_DESCALE_P1_2)
+%define DESCALE_P2_2   (CONST_BITS+PASS1_BITS+3+2)     ; = 20 with the values above; presumably the 2x2 pass-2 descale shift (see PD_DESCALE_P2_2)
+
+%if CONST_BITS == 13
+F_0_211        equ      1730           ; FIX(0.211164243), i.e. round(0.211164243 * 2^CONST_BITS); same scheme for every F_x_xxx below
+F_0_509        equ      4176           ; FIX(0.509795579)
+F_0_601        equ      4926           ; FIX(0.601344887)
+F_0_720        equ      5906           ; FIX(0.720959822)
+F_0_765        equ      6270           ; FIX(0.765366865)
+F_0_850        equ      6967           ; FIX(0.850430095)
+F_0_899        equ      7373           ; FIX(0.899976223)
+F_1_061        equ      8697           ; FIX(1.061594337)
+F_1_272        equ     10426           ; FIX(1.272758580)
+F_1_451        equ     11893           ; FIX(1.451774981)
+F_1_847        equ     15137           ; FIX(1.847759065)
+F_2_172        equ     17799           ; FIX(2.172734803)
+F_2_562        equ     20995           ; FIX(2.562915447)
+F_3_624        equ     29692           ; FIX(3.624509785)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants, so each literal below is the constant pre-scaled by 2^30 and DESCALE rounds it down to CONST_BITS fractional bits.
+%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))        ; x >> n, rounded to nearest (adds half of 2^n before shifting)
+F_0_211        equ     DESCALE( 226735879,30-CONST_BITS)       ; FIX(0.211164243)
+F_0_509        equ     DESCALE( 547388834,30-CONST_BITS)       ; FIX(0.509795579)
+F_0_601        equ     DESCALE( 645689155,30-CONST_BITS)       ; FIX(0.601344887)
+F_0_720        equ     DESCALE( 774124714,30-CONST_BITS)       ; FIX(0.720959822)
+F_0_765        equ     DESCALE( 821806413,30-CONST_BITS)       ; FIX(0.765366865)
+F_0_850        equ     DESCALE( 913142361,30-CONST_BITS)       ; FIX(0.850430095)
+F_0_899        equ     DESCALE( 966342111,30-CONST_BITS)       ; FIX(0.899976223)
+F_1_061        equ     DESCALE(1139878239,30-CONST_BITS)       ; FIX(1.061594337)
+F_1_272        equ     DESCALE(1366614119,30-CONST_BITS)       ; FIX(1.272758580)
+F_1_451        equ     DESCALE(1558831516,30-CONST_BITS)       ; FIX(1.451774981)
+F_1_847        equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
+F_2_172        equ     DESCALE(2332956230,30-CONST_BITS)       ; FIX(2.172734803)
+F_2_562        equ     DESCALE(2751909506,30-CONST_BITS)       ; FIX(2.562915447)
+F_3_624        equ     DESCALE(3891787747,30-CONST_BITS)       ; FIX(3.624509785)
+%endif
+
+; --------------------------------------------------------------------------
+       SECTION SEG_CONST       ; constant tables for the reduced-size IDCT; SEG_CONST is defined outside this file (presumably jsimdext.inc)
+
+       alignz  16              ; each table entry below is 16 bytes, so every entry starts on an XMMWORD boundary (alignz: macro defined outside this file)
+       global  EXTN(jconst_idct_red_sse2)      ; export the label that marks the start of the table
+
+EXTN(jconst_idct_red_sse2):    ; start of the table; the entries below follow it contiguously
+
+PW_F184_MF076  times 4 dw  F_1_847,-F_0_765    ; 4 x word pair ( FIX(1.847759065), -FIX(0.765366865))
+PW_F256_F089   times 4 dw  F_2_562, F_0_899    ; 4 x word pair ( FIX(2.562915447),  FIX(0.899976223)); pmaddwd operand in the odd part
+PW_F106_MF217  times 4 dw  F_1_061,-F_2_172    ; 4 x word pair ( FIX(1.061594337), -FIX(2.172734803)); pmaddwd operand in the odd part
+PW_MF060_MF050 times 4 dw -F_0_601,-F_0_509    ; 4 x word pair (-FIX(0.601344887), -FIX(0.509795579)); pmaddwd operand in the odd part
+PW_F145_MF021  times 4 dw  F_1_451,-F_0_211    ; 4 x word pair ( FIX(1.451774981), -FIX(0.211164243)); pmaddwd operand in the odd part
+PW_F362_MF127  times 4 dw  F_3_624,-F_1_272    ; 4 x word pair ( FIX(3.624509785), -FIX(1.272758580))
+PW_F085_MF072  times 4 dw  F_0_850,-F_0_720    ; 4 x word pair ( FIX(0.850430095), -FIX(0.720959822))
+PD_DESCALE_P1_4        times 4 dd  1 << (DESCALE_P1_4-1)       ; 4 x dword 2^(DESCALE_P1_4-1): rounding term, presumably added before a right shift by DESCALE_P1_4
+PD_DESCALE_P2_4        times 4 dd  1 << (DESCALE_P2_4-1)       ; 4 x dword 2^(DESCALE_P2_4-1): rounding term, presumably added before a right shift by DESCALE_P2_4
+PD_DESCALE_P1_2        times 4 dd  1 << (DESCALE_P1_2-1)       ; 4 x dword 2^(DESCALE_P1_2-1): rounding term, presumably added before a right shift by DESCALE_P1_2
+PD_DESCALE_P2_2        times 4 dd  1 << (DESCALE_P2_2-1)       ; 4 x dword 2^(DESCALE_P2_2-1): rounding term, presumably added before a right shift by DESCALE_P2_2
+PB_CENTERJSAMP times 16 db CENTERJSAMPLE       ; CENTERJSAMPLE replicated into all 16 byte lanes (CENTERJSAMPLE is defined outside this file)
+
+       alignz  16              ; pad the end of the table to a 16-byte boundary
+
+; --------------------------------------------------------------------------
+       SECTION SEG_TEXT
+       BITS    64
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_4x4_sse2 (void * dct_table, JCOEFPTR coef_block,
+;                      JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void * dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13 = JDIMENSION output_col
+
+%define original_rbp   rbp+0
+%define wk(i)          rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM         2
+
+       align   16
+       global  EXTN(jsimd_idct_4x4_sse2)
+
+EXTN(jsimd_idct_4x4_sse2):
+       push    rbp
+       mov     rax,rsp                         ; rax = original rbp
+       sub     rsp, byte 4
+       and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
+       mov     [rsp],eax
+       mov     rbp,rsp                         ; rbp = aligned rbp
+       lea     rsp, [wk(0)]
+       collect_args
+
+       ; ---- Pass 1: process columns from input.
+
+       mov     rdx, r10        ; quantptr
+       mov     rsi, r11                ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
+       mov     eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+       or      eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+       jnz     short .columnDCT
+
+       movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+       movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+       por     xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+       por     xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+       por     xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+       por     xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+       por     xmm0,xmm1
+       packsswb xmm0,xmm0
+       packsswb xmm0,xmm0
+       movd    eax,xmm0
+       test    rax,rax
+       jnz     short .columnDCT
+
+       ; -- AC terms all zero
+
+       movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+       pmullw  xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+       psllw   xmm0,PASS1_BITS
+
+       movdqa    xmm3,xmm0     ; xmm0=in0=(00 01 02 03 04 05 06 07)
+       punpcklwd xmm0,xmm0     ; xmm0=(00 00 01 01 02 02 03 03)
+       punpckhwd xmm3,xmm3     ; xmm3=(04 04 05 05 06 06 07 07)
+
+       pshufd  xmm1,xmm0,0x50  ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
+       pshufd  xmm0,xmm0,0xFA  ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
+       pshufd  xmm6,xmm3,0x50  ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
+       pshufd  xmm3,xmm3,0xFA  ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
+
+       jmp     near .column_end
+%endif
+.columnDCT:
+
+       ; -- Odd part
+
+       movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+       movdqa  xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+       pmullw  xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+       pmullw  xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+       movdqa  xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+       movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+       pmullw  xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+       pmullw  xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+       movdqa    xmm4,xmm0
+       movdqa    xmm5,xmm0
+       punpcklwd xmm4,xmm1
+       punpckhwd xmm5,xmm1
+       movdqa    xmm0,xmm4
+       movdqa    xmm1,xmm5
+       pmaddwd   xmm4,[PW_F256_F089]   ; xmm4=(tmp2L)
+       pmaddwd   xmm5,[PW_F256_F089]   ; xmm5=(tmp2H)
+       pmaddwd   xmm0,[PW_F106_MF217]  ; xmm0=(tmp0L)
+       pmaddwd   xmm1,[PW_F106_MF217]  ; xmm1=(tmp0H)
+
+       movdqa    xmm6,xmm2
+       movdqa    xmm7,xmm2
+       punpcklwd xmm6,xmm3
+       punpckhwd xmm7,xmm3
+       movdqa    xmm2,xmm6
+       movdqa    xmm3,xmm7
+       pmaddwd   xmm6,[PW_MF060_MF050] ; xmm6=(tmp2L)
+       pmaddwd   xmm7,[PW_MF060_MF050] ; xmm7=(tmp2H)
+       pmaddwd   xmm2,[PW_F145_MF021]  ; xmm2=(tmp0L)
+       pmaddwd   xmm3,[PW_F145_MF021]  ; xmm3=(tmp0H)
+
+       paddd   xmm6,xmm4               ; xmm6=tmp2L
+       paddd   xmm7,xmm5               ; xmm7=tmp2H
+       paddd   xmm2,xmm0               ; xmm2=tmp0L
+       paddd   xmm3,xmm1               ; xmm3=tmp0H
+
+       movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=tmp0L
+       movdqa  XMMWORD [wk(1)], xmm3   ; wk(1)=tmp0H
+
+       ; -- Even part
+
+       movdqa  xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+       movdqa  xmm5, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+       movdqa  xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+       pmullw  xmm4, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+       pmullw  xmm5, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+       pmullw  xmm0, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+       pxor      xmm1,xmm1
+       pxor      xmm2,xmm2
+       punpcklwd xmm1,xmm4             ; xmm1=tmp0L
+       punpckhwd xmm2,xmm4             ; xmm2=tmp0H
+       psrad     xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
+       psrad     xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
+
+       movdqa    xmm3,xmm5             ; xmm5=in2=z2
+       punpcklwd xmm5,xmm0             ; xmm0=in6=z3
+       punpckhwd xmm3,xmm0
+       pmaddwd   xmm5,[PW_F184_MF076]  ; xmm5=tmp2L
+       pmaddwd   xmm3,[PW_F184_MF076]  ; xmm3=tmp2H
+
+       movdqa  xmm4,xmm1
+       movdqa  xmm0,xmm2
+       paddd   xmm1,xmm5               ; xmm1=tmp10L
+       paddd   xmm2,xmm3               ; xmm2=tmp10H
+       psubd   xmm4,xmm5               ; xmm4=tmp12L
+       psubd   xmm0,xmm3               ; xmm0=tmp12H
+
+       ; -- Final output stage
+
+       movdqa  xmm5,xmm1
+       movdqa  xmm3,xmm2
+       paddd   xmm1,xmm6               ; xmm1=data0L
+       paddd   xmm2,xmm7               ; xmm2=data0H
+       psubd   xmm5,xmm6               ; xmm5=data3L
+       psubd   xmm3,xmm7               ; xmm3=data3H
+
+       movdqa  xmm6,[PD_DESCALE_P1_4]  ; xmm6=[PD_DESCALE_P1_4]
+
+       paddd   xmm1,xmm6
+       paddd   xmm2,xmm6
+       psrad   xmm1,DESCALE_P1_4
+       psrad   xmm2,DESCALE_P1_4
+       paddd   xmm5,xmm6
+       paddd   xmm3,xmm6
+       psrad   xmm5,DESCALE_P1_4
+       psrad   xmm3,DESCALE_P1_4
+
+       packssdw  xmm1,xmm2             ; xmm1=data0=(00 01 02 03 04 05 06 07)
+       packssdw  xmm5,xmm3             ; xmm5=data3=(30 31 32 33 34 35 36 37)
+
+       movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp0L
+       movdqa  xmm6, XMMWORD [wk(1)]   ; xmm6=tmp0H
+
+       movdqa  xmm2,xmm4
+       movdqa  xmm3,xmm0
+       paddd   xmm4,xmm7               ; xmm4=data1L
+       paddd   xmm0,xmm6               ; xmm0=data1H
+       psubd   xmm2,xmm7               ; xmm2=data2L
+       psubd   xmm3,xmm6               ; xmm3=data2H
+
+       movdqa  xmm7,[PD_DESCALE_P1_4]  ; xmm7=[PD_DESCALE_P1_4]
+
+       paddd   xmm4,xmm7
+       paddd   xmm0,xmm7
+       psrad   xmm4,DESCALE_P1_4
+       psrad   xmm0,DESCALE_P1_4
+       paddd   xmm2,xmm7
+       paddd   xmm3,xmm7
+       psrad   xmm2,DESCALE_P1_4
+       psrad   xmm3,DESCALE_P1_4
+
+       packssdw  xmm4,xmm0             ; xmm4=data1=(10 11 12 13 14 15 16 17)
+       packssdw  xmm2,xmm3             ; xmm2=data2=(20 21 22 23 24 25 26 27)
+
+       movdqa    xmm6,xmm1     ; transpose coefficients(phase 1)
+       punpcklwd xmm1,xmm4     ; xmm1=(00 10 01 11 02 12 03 13)
+       punpckhwd xmm6,xmm4     ; xmm6=(04 14 05 15 06 16 07 17)
+       movdqa    xmm7,xmm2     ; transpose coefficients(phase 1)
+       punpcklwd xmm2,xmm5     ; xmm2=(20 30 21 31 22 32 23 33)
+       punpckhwd xmm7,xmm5     ; xmm7=(24 34 25 35 26 36 27 37)
+
+       movdqa    xmm0,xmm1     ; transpose coefficients(phase 2)
+       punpckldq xmm1,xmm2     ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
+       punpckhdq xmm0,xmm2     ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
+       movdqa    xmm3,xmm6     ; transpose coefficients(phase 2)
+       punpckldq xmm6,xmm7     ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
+       punpckhdq xmm3,xmm7     ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
+.column_end:
+
+       ; -- Prefetch the next coefficient block
+
+       prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+       prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+       prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+       prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+       ; ---- Pass 2: process rows, store into output array.
+
+       mov     rax, [original_rbp]
+       mov     rdi, r12        ; (JSAMPROW *)
+       mov     rax, r13
+
+       ; -- Even part
+
+       pxor      xmm4,xmm4
+       punpcklwd xmm4,xmm1             ; xmm4=tmp0
+       psrad     xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
+
+       ; -- Odd part
+
+       punpckhwd xmm1,xmm0
+       punpckhwd xmm6,xmm3
+       movdqa    xmm5,xmm1
+       movdqa    xmm2,xmm6
+       pmaddwd   xmm1,[PW_F256_F089]   ; xmm1=(tmp2)
+       pmaddwd   xmm6,[PW_MF060_MF050] ; xmm6=(tmp2)
+       pmaddwd   xmm5,[PW_F106_MF217]  ; xmm5=(tmp0)
+       pmaddwd   xmm2,[PW_F145_MF021]  ; xmm2=(tmp0)
+
+       paddd     xmm6,xmm1             ; xmm6=tmp2
+       paddd     xmm2,xmm5             ; xmm2=tmp0
+
+       ; -- Even part
+
+       punpcklwd xmm0,xmm3
+       pmaddwd   xmm0,[PW_F184_MF076]  ; xmm0=tmp2
+
+       movdqa    xmm7,xmm4
+       paddd     xmm4,xmm0             ; xmm4=tmp10
+       psubd     xmm7,xmm0             ; xmm7=tmp12
+
+       ; -- Final output stage
+
+       movdqa  xmm1,[PD_DESCALE_P2_4]  ; xmm1=[PD_DESCALE_P2_4]
+
+       movdqa  xmm5,xmm4
+       movdqa  xmm3,xmm7
+       paddd   xmm4,xmm6               ; xmm4=data0=(00 10 20 30)
+       paddd   xmm7,xmm2               ; xmm7=data1=(01 11 21 31)
+       psubd   xmm5,xmm6               ; xmm5=data3=(03 13 23 33)
+       psubd   xmm3,xmm2               ; xmm3=data2=(02 12 22 32)
+
+       paddd   xmm4,xmm1
+       paddd   xmm7,xmm1
+       psrad   xmm4,DESCALE_P2_4
+       psrad   xmm7,DESCALE_P2_4
+       paddd   xmm5,xmm1
+       paddd   xmm3,xmm1
+       psrad   xmm5,DESCALE_P2_4
+       psrad   xmm3,DESCALE_P2_4
+
+       packssdw  xmm4,xmm3             ; xmm4=(00 10 20 30 02 12 22 32)
+       packssdw  xmm7,xmm5             ; xmm7=(01 11 21 31 03 13 23 33)
+
+       movdqa    xmm0,xmm4             ; transpose coefficients(phase 1)
+       punpcklwd xmm4,xmm7             ; xmm4=(00 01 10 11 20 21 30 31)
+       punpckhwd xmm0,xmm7             ; xmm0=(02 03 12 13 22 23 32 33)
+
+       movdqa    xmm6,xmm4             ; transpose coefficients(phase 2)
+       punpckldq xmm4,xmm0             ; xmm4=(00 01 02 03 10 11 12 13)
+       punpckhdq xmm6,xmm0             ; xmm6=(20 21 22 23 30 31 32 33)
+
+       packsswb  xmm4,xmm6             ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
+       paddb     xmm4,[PB_CENTERJSAMP]
+
+       pshufd    xmm2,xmm4,0x39        ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
+       pshufd    xmm1,xmm4,0x4E        ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
+       pshufd    xmm3,xmm4,0x93        ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
+
+       mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+       mov     rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+       movd    XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
+       movd    XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
+       mov     rdx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+       mov     rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+       movd    XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
+       movd    XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
+
+       uncollect_args
+       mov     rsp,rbp         ; rsp <- aligned rbp
+       pop     rsp             ; rsp <- original rbp
+       pop     rbp
+       ret
+
+
+; --------------------------------------------------------------------------
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_2x2_sse2 (void * dct_table, JCOEFPTR coef_block,
+;                      JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+; Only coefficient rows 0,1,3,5,7 and columns 0,1,3,5,7 contribute to the
+; result (see the input/result diagrams in the body).  Two samples are
+; written to each of output_buf[0] and output_buf[1], starting at byte
+; offset output_col.
+;
+; Register usage: rbx is saved/restored by this routine, r10-r15 by
+; collect_args/uncollect_args.  rax, rcx, rdx, rsi, rdi and xmm0-xmm7 are
+; used as scratch and are not preserved.
+;
+
+; r10 = void * dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13 = JDIMENSION output_col
+
+       align   16
+       global  EXTN(jsimd_idct_2x2_sse2)
+
+EXTN(jsimd_idct_2x2_sse2):
+       push    rbp
+       mov     rbp,rsp
+       push    rbx             ; rbx is used below as a pextrw destination
+       collect_args
+
+       ; ---- Pass 1: process columns from input.
+
+       mov     rdx, r10        ; quantptr
+       mov     rsi, r11                ; inptr
+
+       ; | input:                  | result:        |
+       ; | 00 01 ** 03 ** 05 ** 07 |                |
+       ; | 10 11 ** 13 ** 15 ** 17 |                |
+       ; | ** ** ** ** ** ** ** ** |                |
+       ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+       ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+       ; | 50 51 ** 53 ** 55 ** 57 |                |
+       ; | ** ** ** ** ** ** ** ** |                |
+       ; | 70 71 ** 73 ** 75 ** 77 |                |
+
+       ; -- Odd part
+
+       movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+       movdqa  xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+       pmullw  xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+       pmullw  xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+       movdqa  xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+       movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+       pmullw  xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+       pmullw  xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+       ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
+       ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
+
+       pcmpeqd   xmm7,xmm7
+       pslld     xmm7,WORD_BIT         ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
+
+       movdqa    xmm4,xmm0             ; xmm4=(10 11 ** 13 ** 15 ** 17)
+       movdqa    xmm5,xmm2             ; xmm5=(50 51 ** 53 ** 55 ** 57)
+       punpcklwd xmm4,xmm1             ; xmm4=(10 30 11 31 ** ** 13 33)
+       punpcklwd xmm5,xmm3             ; xmm5=(50 70 51 71 ** ** 53 73)
+       pmaddwd   xmm4,[PW_F362_MF127]
+       pmaddwd   xmm5,[PW_F085_MF072]
+
+       psrld   xmm0,WORD_BIT           ; xmm0=(11 -- 13 -- 15 -- 17 --)
+       pand    xmm1,xmm7               ; xmm1=(-- 31 -- 33 -- 35 -- 37)
+       psrld   xmm2,WORD_BIT           ; xmm2=(51 -- 53 -- 55 -- 57 --)
+       pand    xmm3,xmm7               ; xmm3=(-- 71 -- 73 -- 75 -- 77)
+       por     xmm0,xmm1               ; xmm0=(11 31 13 33 15 35 17 37)
+       por     xmm2,xmm3               ; xmm2=(51 71 53 73 55 75 57 77)
+       pmaddwd xmm0,[PW_F362_MF127]
+       pmaddwd xmm2,[PW_F085_MF072]
+
+       paddd   xmm4,xmm5               ; xmm4=tmp0[col0 col1 **** col3]
+       paddd   xmm0,xmm2               ; xmm0=tmp0[col1 col3 col5 col7]
+
+       ; -- Even part
+
+       movdqa  xmm6, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+       pmullw  xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+       ; xmm6=(00 01 ** 03 ** 05 ** 07)
+
+       movdqa  xmm1,xmm6               ; xmm1=(00 01 ** 03 ** 05 ** 07)
+       pslld   xmm6,WORD_BIT           ; xmm6=(-- 00 -- ** -- ** -- **)
+       pand    xmm1,xmm7               ; xmm1=(-- 01 -- 03 -- 05 -- 07)
+       psrad   xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
+       psrad   xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
+
+       ; -- Final output stage
+
+       movdqa  xmm3,xmm6
+       movdqa  xmm5,xmm1
+       paddd   xmm6,xmm4       ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
+       paddd   xmm1,xmm0       ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
+       psubd   xmm3,xmm4       ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
+       psubd   xmm5,xmm0       ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
+
+       movdqa  xmm2,[PD_DESCALE_P1_2]  ; xmm2=[PD_DESCALE_P1_2]
+
+       punpckldq  xmm6,xmm3            ; xmm6=(A0 B0 ** **)
+
+       movdqa     xmm7,xmm1
+       punpcklqdq xmm1,xmm5            ; xmm1=(A1 A3 B1 B3)
+       punpckhqdq xmm7,xmm5            ; xmm7=(A5 A7 B5 B7)
+
+       paddd   xmm6,xmm2
+       psrad   xmm6,DESCALE_P1_2
+
+       paddd   xmm1,xmm2
+       paddd   xmm7,xmm2
+       psrad   xmm1,DESCALE_P1_2
+       psrad   xmm7,DESCALE_P1_2
+
+       ; -- Prefetch the next coefficient block
+
+       prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+       prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+       prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+       prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+       ; ---- Pass 2: process rows, store into output array.
+
+       mov     rdi, r12        ; (JSAMPROW *)
+       mov     eax, r13d       ; output_col is a 32-bit JDIMENSION: the upper
+                               ; half of the argument register is undefined
+                               ; on entry, so zero-extend it into rax before
+                               ; it is used as a 64-bit store index below
+
+       ; | input:| result:|
+       ; | A0 B0 |        |
+       ; | A1 B1 | C0 C1  |
+       ; | A3 B3 | D0 D1  |
+       ; | A5 B5 |        |
+       ; | A7 B7 |        |
+
+       ; -- Odd part
+
+       packssdw  xmm1,xmm1             ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
+       packssdw  xmm7,xmm7             ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
+       pmaddwd   xmm1,[PW_F362_MF127]
+       pmaddwd   xmm7,[PW_F085_MF072]
+
+       paddd     xmm1,xmm7             ; xmm1=tmp0[row0 row1 row0 row1]
+
+       ; -- Even part
+
+       pslld     xmm6,(CONST_BITS+2)   ; xmm6=tmp10[row0 row1 **** ****]
+
+       ; -- Final output stage
+
+       movdqa    xmm4,xmm6
+       paddd     xmm6,xmm1     ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
+       psubd     xmm4,xmm1     ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
+
+       punpckldq xmm6,xmm4     ; xmm6=(C0 D0 C1 D1)
+
+       paddd     xmm6,[PD_DESCALE_P2_2]
+       psrad     xmm6,DESCALE_P2_2
+
+       packssdw  xmm6,xmm6             ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
+       packsswb  xmm6,xmm6             ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
+       paddb     xmm6,[PB_CENTERJSAMP]
+
+       pextrw  ebx,xmm6,0x00           ; ebx=(C0 D0 -- --)
+       pextrw  ecx,xmm6,0x01           ; ecx=(C1 D1 -- --)
+
+       ; Each output row receives two adjacent samples with one 16-bit store.
+       mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+       mov     rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+       mov     WORD [rdx+rax*SIZEOF_JSAMPLE], bx
+       mov     WORD [rsi+rax*SIZEOF_JSAMPLE], cx
+
+       uncollect_args
+       pop     rbx
+       pop     rbp
+       ret
diff --git a/common/jpeg/simd/jsimdcpu-64.asm b/common/jpeg/simd/jsimdcpu-64.asm
new file mode 100644 (file)
index 0000000..6eaf629
--- /dev/null
@@ -0,0 +1,95 @@
+;
+; jsimdcpu.asm - SIMD instruction support check
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler); it can
+; *not* be assembled with Microsoft's MASM or any compatible assembler
+; (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+       SECTION SEG_TEXT
+       BITS    64
+;
+; Check if the CPU supports SIMD instructions
+;
+; GLOBAL(unsigned int)
+; jpeg_simd_cpu_support (void)
+;
+
+       align   16
+       global  EXTN(jpeg_simd_cpu_support)
+
+EXTN(jpeg_simd_cpu_support):
+       push    rbx                     ; cpuid overwrites rbx
+
+       xor     edi,edi                 ; edi = accumulated JSIMD_* flags
+
+       ; CPUID is usable only if the ID bit (bit 21) of the flags
+       ; register can be toggled by software.
+       pushfq
+       pop     rax                     ; rax = current flags
+       mov     edx,eax                 ; edx = reference copy
+       xor     eax, 1<<21              ; try to flip the ID bit
+       push    rax
+       popfq
+       pushfq
+       pop     rax                     ; rax = flags the CPU accepted
+       xor     eax,edx                 ; zero -> ID bit did not change
+       jz      short .done             ; CPUID is not supported
+
+       ; Leaf 0: eax == 0 means leaf 1 (feature flags) is unavailable
+       xor     eax,eax
+       cpuid
+       test    eax,eax
+       jz      short .done
+
+       ; Leaf 1: standard feature flags come back in edx
+       mov     eax, 1
+       cpuid
+       mov     eax,edx                 ; eax = Standard feature flags
+
+       test    eax, 1<<23              ; bit23:MMX
+       jz      short .skip_mmx
+       or      edi, byte JSIMD_MMX
+.skip_mmx:
+       test    eax, 1<<25              ; bit25:SSE
+       jz      short .skip_sse
+       or      edi, byte JSIMD_SSE
+.skip_sse:
+       test    eax, 1<<26              ; bit26:SSE2
+       jz      short .skip_sse2
+       or      edi, byte JSIMD_SSE2
+.skip_sse2:
+
+       ; Extended leaves: needed only for the 3DNow! bit
+       mov     eax, 0x80000000
+       cpuid
+       cmp     eax, 0x80000000
+       jbe     short .done             ; no leaf above 0x80000000
+
+       mov     eax, 0x80000001
+       cpuid
+       mov     eax,edx                 ; eax = Extended feature flags
+
+       test    eax, 1<<31              ; bit31:3DNow!(vendor independent)
+       jz      short .skip_3dnow
+       or      edi, byte JSIMD_3DNOW
+.skip_3dnow:
+
+.done:
+       mov     eax,edi                 ; return the collected flags
+
+       pop     rbx
+       ret
+
index d82b970e0d5581eb4e9fd11851944a73a3a0b523..878f0456da6e39ce14e064d9e62cc1ed523121c1 100644 (file)
 %define SEG_TEXT    .text  align=16 public use32 class=CODE
 %define SEG_CONST   .data  align=16 public use32 class=DATA
 
-%elifdef ELF   ; ----(nasm -felf -DELF ...)------------
+%elifdef ELF   ; ----(nasm -felf[64] -DELF ...)------------
 ; * Linux
 ; * *BSD family Unix using elf format
 ; * Unix System V, including Solaris x86, UnixWare and SCO Unix
 
 ; -- segment definition --
 ;
+%ifdef __x86_64__
+%define SEG_TEXT    .text   progbits align=16     ; 64-bit: no explicit alloc/exec/nowrite flags
+%define SEG_CONST   .rodata progbits align=16     ; NOTE(review): presumably relies on NASM's defaults for these sections - confirm
+%else
 %define SEG_TEXT    .text   progbits alloc exec   nowrite align=16
 %define SEG_CONST   .rodata progbits alloc noexec nowrite align=16
+%endif
 
 ; To make the code position-independent, append -DPIC to the commandline
 ;
 ; --------------------------------------------------------------------------
 ;  Common types
 ;
+%ifdef __x86_64__
+%define POINTER                 qword           ; general pointer type
+%define SIZEOF_POINTER          SIZEOF_QWORD    ; sizeof(POINTER)
+%define POINTER_BIT             QWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
+%else
 %define POINTER                 dword           ; general pointer type
 %define SIZEOF_POINTER          SIZEOF_DWORD    ; sizeof(POINTER)
 %define POINTER_BIT             DWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
+%endif
 
 %define INT                     dword           ; signed integer type
 %define SIZEOF_INT              SIZEOF_DWORD    ; sizeof(INT)
@@ -268,6 +279,32 @@ const_base:
        align %1, db 0          ; filling zeros
 %endmacro
 
+%ifdef __x86_64__
+%imacro collect_args 0         ; save r10-r15 on the stack, then copy the six argument registers into them
+       push r10
+       push r11
+       push r12
+       push r13
+       push r14
+       push r15                ; six pushes: rsp is now 48 bytes lower
+       mov r10, rdi            ; r10 = 1st argument
+       mov r11, rsi            ; r11 = 2nd argument
+       mov r12, rdx            ; r12 = 3rd argument
+       mov r13, rcx            ; r13 = 4th argument (full 64-bit copy, no extension applied)
+       mov r14, r8             ; r14 = 5th argument
+       mov r15, r9             ; r15 = 6th argument
+%endmacro
+
+%imacro uncollect_args 0       ; restore r10-r15; rsp must be where collect_args left it
+       pop r15                 ; pops mirror the push order in collect_args
+       pop r14
+       pop r13
+       pop r12
+       pop r11
+       pop r10
+%endmacro
+
+%endif
 
 ; --------------------------------------------------------------------------
 ;  Defines picked up from the C headers
index ef5a591e77fe2265ecace17cccaf10009c9704da..68893b70e2a588d9a95c6e85ac48b76df640950e 100755 (executable)
@@ -11,7 +11,7 @@ while [ $# -gt 0 ]; do
                 pic=yes
             fi
             ;;
-        -f|-fbin|-faout|-faoutb|-fcoff|-felf|-fas86| \
+        -f|-fbin|-faout|-faoutb|-fcoff|-felf|-felf64|-fas86| \
         -fobj|-fwin32|-frdf|-fieee|-fmacho)
             # it's a file format specifier for nasm.
             command="$command $1"