From b85c2f8ae8090c039b40d6f01c004632fbd8be17 Mon Sep 17 00:00:00 2001 From: Pierre Ossman Date: Mon, 9 Mar 2009 10:37:20 +0000 Subject: [PATCH] Split up the forward DCT routine into three stages Divide it into sample conversion, DCT and quantization in order to easily provide alternative implementations of each stage. git-svn-id: svn://svn.code.sf.net/p/tigervnc/code/trunk@3644 3789f03b-4d11-0410-bbf8-ca57d06f2519 --- common/jpeg/jcdctmgr.c | 307 ++++++++++++++++++++++++++--------------- common/jpeg/jdct.h | 3 - 2 files changed, 199 insertions(+), 111 deletions(-) diff --git a/common/jpeg/jcdctmgr.c b/common/jpeg/jcdctmgr.c index 61fa79b9..45b9c0dc 100644 --- a/common/jpeg/jcdctmgr.c +++ b/common/jpeg/jcdctmgr.c @@ -19,11 +19,30 @@ /* Private subobject for this module */ +typedef JMETHOD(void, forward_DCT_method_ptr, (DCTELEM * data)); +typedef JMETHOD(void, float_DCT_method_ptr, (FAST_FLOAT * data)); + +typedef JMETHOD(void, convsamp_method_ptr, + (JSAMPARRAY sample_data, JDIMENSION start_col, + DCTELEM * workspace)); +typedef JMETHOD(void, float_convsamp_method_ptr, + (JSAMPARRAY sample_data, JDIMENSION start_col, + FAST_FLOAT *workspace)); + +typedef JMETHOD(void, quantize_method_ptr, + (JCOEFPTR coef_block, DCTELEM * divisors, + DCTELEM * workspace)); +typedef JMETHOD(void, float_quantize_method_ptr, + (JCOEFPTR coef_block, FAST_FLOAT * divisors, + FAST_FLOAT * workspace)); + typedef struct { struct jpeg_forward_dct pub; /* public fields */ /* Pointer to the DCT routine actually in use */ - forward_DCT_method_ptr do_dct; + forward_DCT_method_ptr dct; + convsamp_method_ptr convsamp; + quantize_method_ptr quantize; /* The actual post-DCT divisors --- not identical to the quant table * entries, because of scaling (especially for an unnormalized DCT). @@ -33,7 +52,9 @@ typedef struct { #ifdef DCT_FLOAT_SUPPORTED /* Same as above for the floating-point case. */ - float_DCT_method_ptr do_float_dct; + float_DCT_method_ptr float_dct; + float_convsamp_method_ptr float_convsamp; + float_quantize_method_ptr float_quantize; FAST_FLOAT * float_divisors[NUM_QUANT_TBLS]; #endif } my_fdct_controller; @@ -168,6 +189,88 @@ start_pass_fdctmgr (j_compress_ptr cinfo) } +/* + * Load data into workspace, applying unsigned->signed conversion. + */ + +METHODDEF(void) +convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace) +{ + register DCTELEM *workspaceptr; + register JSAMPROW elemptr; + register int elemr; + + workspaceptr = workspace; + for (elemr = 0; elemr < DCTSIZE; elemr++) { + elemptr = sample_data[elemr] + start_col; + +#if DCTSIZE == 8 /* unroll the inner loop */ + *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; + *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; + *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; + *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; + *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; + *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; + *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; + *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; +#else + { + register int elemc; + for (elemc = DCTSIZE; elemc > 0; elemc--) + *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; + } +#endif + } +} + + +/* + * Quantize/descale the coefficients, and store into coef_blocks[]. + */ + +METHODDEF(void) +quantize (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace) +{ + register DCTELEM temp, qval; + register int i; + register JCOEFPTR output_ptr = coef_block; + + for (i = 0; i < DCTSIZE2; i++) { + qval = divisors[i]; + temp = workspace[i]; + + /* Divide the coefficient value by qval, ensuring proper rounding. + * Since C does not specify the direction of rounding for negative + * quotients, we have to force the dividend positive for portability. + * + * In most files, at least half of the output values will be zero + * (at default quantization settings, more like three-quarters...) + * so we should ensure that this case is fast. On many machines, + * a comparison is enough cheaper than a divide to make a special test + * a win. Since both inputs will be nonnegative, we need only test + * for a < b to discover whether a/b is 0. + * If your machine's division is fast enough, define FAST_DIVIDE. + */ +#ifdef FAST_DIVIDE +#define DIVIDE_BY(a,b) a /= b +#else +#define DIVIDE_BY(a,b) if (a >= b) a /= b; else a = 0 +#endif + + if (temp < 0) { + temp = -temp; + temp += qval>>1; /* for rounding */ + DIVIDE_BY(temp, qval); + temp = -temp; + } else { + temp += qval>>1; /* for rounding */ + DIVIDE_BY(temp, qval); + } + output_ptr[i] = (JCOEF) temp; + } +} + + /* * Perform forward DCT on one or more blocks of a component. * @@ -185,86 +288,85 @@ forward_DCT (j_compress_ptr cinfo, jpeg_component_info * compptr, { /* This routine is heavily used, so it's worth coding it tightly. */ my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct; - forward_DCT_method_ptr do_dct = fdct->do_dct; DCTELEM * divisors = fdct->divisors[compptr->quant_tbl_no]; DCTELEM workspace[DCTSIZE2]; /* work area for FDCT subroutine */ JDIMENSION bi; + /* Make sure the compiler doesn't look up these every pass */ + forward_DCT_method_ptr do_dct = fdct->dct; + convsamp_method_ptr do_convsamp = fdct->convsamp; + quantize_method_ptr do_quantize = fdct->quantize; + sample_data += start_row; /* fold in the vertical offset once */ for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) { /* Load data into workspace, applying unsigned->signed conversion */ - { register DCTELEM *workspaceptr; - register JSAMPROW elemptr; - register int elemr; - - workspaceptr = workspace; - for (elemr = 0; elemr < DCTSIZE; elemr++) { - elemptr = sample_data[elemr] + start_col; -#if DCTSIZE == 8 /* unroll the inner loop */ - *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; - *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; - *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; - *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; - *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; - *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; - *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; - *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; -#else - { register int elemc; - for (elemc = DCTSIZE; elemc > 0; elemc--) { - *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE; - } - } -#endif - } - } + (*do_convsamp) (sample_data, start_col, workspace); /* Perform the DCT */ (*do_dct) (workspace); /* Quantize/descale the coefficients, and store into coef_blocks[] */ - { register DCTELEM temp, qval; - register int i; - register JCOEFPTR output_ptr = coef_blocks[bi]; + (*do_quantize) (coef_blocks[bi], divisors, workspace); + } +} - for (i = 0; i < DCTSIZE2; i++) { - qval = divisors[i]; - temp = workspace[i]; - /* Divide the coefficient value by qval, ensuring proper rounding. - * Since C does not specify the direction of rounding for negative - * quotients, we have to force the dividend positive for portability. - * - * In most files, at least half of the output values will be zero - * (at default quantization settings, more like three-quarters...) - * so we should ensure that this case is fast. On many machines, - * a comparison is enough cheaper than a divide to make a special test - * a win. Since both inputs will be nonnegative, we need only test - * for a < b to discover whether a/b is 0. - * If your machine's division is fast enough, define FAST_DIVIDE. - */ -#ifdef FAST_DIVIDE -#define DIVIDE_BY(a,b) a /= b + +#ifdef DCT_FLOAT_SUPPORTED + + +METHODDEF(void) +convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT * workspace) +{ + register FAST_FLOAT *workspaceptr; + register JSAMPROW elemptr; + register int elemr; + + workspaceptr = workspace; + for (elemr = 0; elemr < DCTSIZE; elemr++) { + elemptr = sample_data[elemr] + start_col; +#if DCTSIZE == 8 /* unroll the inner loop */ + *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); + *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); + *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); + *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); + *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); + *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); + *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); + *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); #else -#define DIVIDE_BY(a,b) if (a >= b) a /= b; else a = 0 -#endif - if (temp < 0) { - temp = -temp; - temp += qval>>1; /* for rounding */ - DIVIDE_BY(temp, qval); - temp = -temp; - } else { - temp += qval>>1; /* for rounding */ - DIVIDE_BY(temp, qval); - } - output_ptr[i] = (JCOEF) temp; - } + { + register int elemc; + for (elemc = DCTSIZE; elemc > 0; elemc--) + *workspaceptr++ = (FAST_FLOAT) + (GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); } +#endif } } -#ifdef DCT_FLOAT_SUPPORTED +METHODDEF(void) +quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors, FAST_FLOAT * workspace) +{ + register FAST_FLOAT temp; + register int i; + register JCOEFPTR output_ptr = coef_block; + + for (i = 0; i < DCTSIZE2; i++) { + /* Apply the quantization and scaling factor */ + temp = workspace[i] * divisors[i]; + + /* Round to nearest integer. + * Since C does not specify the direction of rounding for negative + * quotients, we have to force the dividend positive for portability. + * The maximum coefficient size is +-16K (for 12-bit data), so this + * code should work for either 16-bit or 32-bit ints. + */ + output_ptr[i] = (JCOEF) ((int) (temp + (FAST_FLOAT) 16384.5) - 16384); + } +} + METHODDEF(void) forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr, @@ -275,62 +377,26 @@ forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr, { /* This routine is heavily used, so it's worth coding it tightly. */ my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct; - float_DCT_method_ptr do_dct = fdct->do_float_dct; FAST_FLOAT * divisors = fdct->float_divisors[compptr->quant_tbl_no]; FAST_FLOAT workspace[DCTSIZE2]; /* work area for FDCT subroutine */ JDIMENSION bi; + /* Make sure the compiler doesn't look up these every pass */ + float_DCT_method_ptr do_dct = fdct->float_dct; + float_convsamp_method_ptr do_convsamp = fdct->float_convsamp; + float_quantize_method_ptr do_quantize = fdct->float_quantize; + sample_data += start_row; /* fold in the vertical offset once */ for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) { /* Load data into workspace, applying unsigned->signed conversion */ - { register FAST_FLOAT *workspaceptr; - register JSAMPROW elemptr; - register int elemr; - - workspaceptr = workspace; - for (elemr = 0; elemr < DCTSIZE; elemr++) { - elemptr = sample_data[elemr] + start_col; -#if DCTSIZE == 8 /* unroll the inner loop */ - *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); - *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); - *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); - *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); - *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); - *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); - *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); - *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); -#else - { register int elemc; - for (elemc = DCTSIZE; elemc > 0; elemc--) { - *workspaceptr++ = (FAST_FLOAT) - (GETJSAMPLE(*elemptr++) - CENTERJSAMPLE); - } - } -#endif - } - } + (*do_convsamp) (sample_data, start_col, workspace); /* Perform the DCT */ (*do_dct) (workspace); /* Quantize/descale the coefficients, and store into coef_blocks[] */ - { register FAST_FLOAT temp; - register int i; - register JCOEFPTR output_ptr = coef_blocks[bi]; - - for (i = 0; i < DCTSIZE2; i++) { - /* Apply the quantization and scaling factor */ - temp = workspace[i] * divisors[i]; - /* Round to nearest integer. - * Since C does not specify the direction of rounding for negative - * quotients, we have to force the dividend positive for portability. - * The maximum coefficient size is +-16K (for 12-bit data), so this - * code should work for either 16-bit or 32-bit ints. - */ - output_ptr[i] = (JCOEF) ((int) (temp + (FAST_FLOAT) 16384.5) - 16384); - } - } + (*do_quantize) (coef_blocks[bi], divisors, workspace); } } @@ -353,23 +419,48 @@ jinit_forward_dct (j_compress_ptr cinfo) cinfo->fdct = (struct jpeg_forward_dct *) fdct; fdct->pub.start_pass = start_pass_fdctmgr; + /* First determine the DCT... */ switch (cinfo->dct_method) { #ifdef DCT_ISLOW_SUPPORTED case JDCT_ISLOW: fdct->pub.forward_DCT = forward_DCT; - fdct->do_dct = jpeg_fdct_islow; + fdct->dct = jpeg_fdct_islow; break; #endif #ifdef DCT_IFAST_SUPPORTED case JDCT_IFAST: fdct->pub.forward_DCT = forward_DCT; - fdct->do_dct = jpeg_fdct_ifast; + fdct->dct = jpeg_fdct_ifast; break; #endif #ifdef DCT_FLOAT_SUPPORTED case JDCT_FLOAT: fdct->pub.forward_DCT = forward_DCT_float; - fdct->do_float_dct = jpeg_fdct_float; + fdct->float_dct = jpeg_fdct_float; + break; +#endif + default: + ERREXIT(cinfo, JERR_NOT_COMPILED); + break; + } + + /* ...then the supporting stages. */ + switch (cinfo->dct_method) { +#ifdef DCT_ISLOW_SUPPORTED + case JDCT_ISLOW: +#endif +#ifdef DCT_IFAST_SUPPORTED + case JDCT_IFAST: +#endif +#if defined(DCT_ISLOW_SUPPORTED) || defined(DCT_IFAST_SUPPORTED) + fdct->convsamp = convsamp; + fdct->quantize = quantize; + break; +#endif +#ifdef DCT_FLOAT_SUPPORTED + case JDCT_FLOAT: + fdct->float_convsamp = convsamp_float; + fdct->float_quantize = quantize_float; break; #endif default: diff --git a/common/jpeg/jdct.h b/common/jpeg/jdct.h index 04192a26..c3650ea0 100644 --- a/common/jpeg/jdct.h +++ b/common/jpeg/jdct.h @@ -32,9 +32,6 @@ typedef int DCTELEM; /* 16 or 32 bits is fine */ typedef INT32 DCTELEM; /* must have 32 bits */ #endif -typedef JMETHOD(void, forward_DCT_method_ptr, (DCTELEM * data)); -typedef JMETHOD(void, float_DCT_method_ptr, (FAST_FLOAT * data)); - /* * An inverse DCT routine is given a pointer to the input JBLOCK and a pointer -- 2.39.5