diff options
author | Pierre Ossman <ossman@cendio.se> | 2009-03-09 13:23:04 +0000 |
---|---|---|
committer | Pierre Ossman <ossman@cendio.se> | 2009-03-09 13:23:04 +0000 |
commit | 4aa2429c35601d34b1755e9731329b65c0bcd6db (patch) | |
tree | bf365bfb75e0cbeeb91a91f25811a1506a815bb0 /common/jpeg/jcdctmgr.c | |
parent | 82c7f3138cd9796b93e26bdb8f7f797b48150ae4 (diff) | |
download | tigervnc-4aa2429c35601d34b1755e9731329b65c0bcd6db.tar.gz tigervnc-4aa2429c35601d34b1755e9731329b65c0bcd6db.zip |
"Optimise" quantization step by replacing the division by a multiplication.
This has no measurable difference right now but makes it possible to do
SIMD implementations of this stage.
git-svn-id: svn://svn.code.sf.net/p/tigervnc/code/trunk@3647 3789f03b-4d11-0410-bbf8-ca57d06f2519
Diffstat (limited to 'common/jpeg/jcdctmgr.c')
-rw-r--r-- | common/jpeg/jcdctmgr.c | 174 |
1 files changed, 143 insertions, 31 deletions
diff --git a/common/jpeg/jcdctmgr.c b/common/jpeg/jcdctmgr.c index 75d48e02..539ccc41 100644 --- a/common/jpeg/jcdctmgr.c +++ b/common/jpeg/jcdctmgr.c @@ -2,6 +2,7 @@ * jcdctmgr.c * * Copyright (C) 1994-1996, Thomas G. Lane. + * Copyright (C) 1999-2006, MIYASAKA Masaru. * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB * This file is part of the Independent JPEG Group's software. * For conditions of distribution and use, see the accompanying README file. @@ -65,6 +66,128 @@ typedef my_fdct_controller * my_fdct_ptr; /* + * Find the highest bit in an integer through binary search. + */ +LOCAL(int) +fls (UINT16 val) +{ + int bit; + + bit = 16; + + if (!val) + return 0; + + if (!(val & 0xff00)) { + bit -= 8; + val <<= 8; + } + if (!(val & 0xf000)) { + bit -= 4; + val <<= 4; + } + if (!(val & 0xc000)) { + bit -= 2; + val <<= 2; + } + if (!(val & 0x8000)) { + bit -= 1; + val <<= 1; + } + + return bit; +} + +/* + * Compute values to do a division using reciprocal. + * + * This implementation is based on an algorithm described in + * "How to optimize for the Pentium family of microprocessors" + * (http://www.agner.org/assem/). + * More information about the basic algorithm can be found in + * the paper "Integer Division Using Reciprocals" by Robert Alverson. + * + * The basic idea is to replace x/d by x * d^-1. In order to store + * d^-1 with enough precision we shift it left a few places. It turns + * out that this algoright gives just enough precision, and also fits + * into DCTELEM: + * + * b = (the number of significant bits in divisor) - 1 + * r = (word size) + b + * f = 2^r / divisor + * + * f will not be an integer for most cases, so we need to compensate + * for the rounding error introduced: + * + * no fractional part: + * + * result = input >> r + * + * fractional part of f < 0.5: + * + * round f down to nearest integer + * result = ((input + 1) * f) >> r + * + * fractional part of f > 0.5: + * + * round f up to nearest integer + * result = (input * f) >> r + * + * This is the original algorithm that gives truncated results. But we + * want properly rounded results, so we replace "input" with + * "input + divisor/2". + * + * In order to allow SIMD implementations we also tweak the values to + * allow the same calculation to be made at all times: + * + * dctbl[0] = f rounded to nearest integer + * dctbl[1] = divisor / 2 (+ 1 if fractional part of f < 0.5) + * dctbl[2] = 1 << ((word size) * 2 - r) + * dctbl[3] = r - (word size) + * + * dctbl[2] is for stupid instruction sets where the shift operation + * isn't member wise (e.g. MMX). + * + * The reason dctbl[2] and dctbl[3] reduce the shift with (word size) + * is that most SIMD implementations have a "multiply and store top + * half" operation. + * + * Lastly, we store each of the values in their own table instead + * of in a consecutive manner, yet again in order to allow SIMD + * routines. + */ +LOCAL(void) +compute_reciprocal (UINT16 divisor, DCTELEM * dtbl) +{ + UDCTELEM2 fq, fr; + UDCTELEM c; + int b, r; + + b = fls(divisor) - 1; + r = sizeof(DCTELEM) * 8 + b; + + fq = ((UDCTELEM2)1 << r) / divisor; + fr = ((UDCTELEM2)1 << r) % divisor; + + c = divisor / 2; /* for rounding */ + + if (fr == 0) { /* divisor is power of two */ + /* fq will be one bit too large to fit in DCTELEM, so adjust */ + fq >>= 1; + r--; + } else if (fr <= (divisor / 2)) { /* fractional part is < 0.5 */ + c++; + } else { /* fractional part is > 0.5 */ + fq++; + } + + dtbl[DCTSIZE2 * 0] = (DCTELEM) fq; /* reciprocal */ + dtbl[DCTSIZE2 * 1] = (DCTELEM) c; /* correction + roundfactor */ + dtbl[DCTSIZE2 * 2] = (DCTELEM) (1 << (sizeof(DCTELEM)*8*2 - r)); /* scale */ + dtbl[DCTSIZE2 * 3] = (DCTELEM) r - sizeof(DCTELEM)*8; /* shift */ +} + +/* * Initialize for a processing pass. * Verify that all referenced Q-tables are present, and set up * the divisor table for each one. @@ -101,11 +224,11 @@ start_pass_fdctmgr (j_compress_ptr cinfo) if (fdct->divisors[qtblno] == NULL) { fdct->divisors[qtblno] = (DCTELEM *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - DCTSIZE2 * SIZEOF(DCTELEM)); + (DCTSIZE2 * 4) * SIZEOF(DCTELEM)); } dtbl = fdct->divisors[qtblno]; for (i = 0; i < DCTSIZE2; i++) { - dtbl[i] = ((DCTELEM) qtbl->quantval[i]) << 3; + compute_reciprocal(qtbl->quantval[i] << 3, &dtbl[i]); } break; #endif @@ -135,14 +258,14 @@ start_pass_fdctmgr (j_compress_ptr cinfo) if (fdct->divisors[qtblno] == NULL) { fdct->divisors[qtblno] = (DCTELEM *) (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE, - DCTSIZE2 * SIZEOF(DCTELEM)); + (DCTSIZE2 * 4) * SIZEOF(DCTELEM)); } dtbl = fdct->divisors[qtblno]; for (i = 0; i < DCTSIZE2; i++) { - dtbl[i] = (DCTELEM) + compute_reciprocal( DESCALE(MULTIPLY16V16((INT32) qtbl->quantval[i], (INT32) aanscales[i]), - CONST_BITS-3); + CONST_BITS-3), &dtbl[i]); } } break; @@ -233,41 +356,30 @@ convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM * workspace) METHODDEF(void) quantize (JCOEFPTR coef_block, DCTELEM * divisors, DCTELEM * workspace) { - register DCTELEM temp, qval; - register int i; - register JCOEFPTR output_ptr = coef_block; + int i; + DCTELEM temp; + UDCTELEM recip, corr, shift; + UDCTELEM2 product; + JCOEFPTR output_ptr = coef_block; for (i = 0; i < DCTSIZE2; i++) { - qval = divisors[i]; temp = workspace[i]; - - /* Divide the coefficient value by qval, ensuring proper rounding. - * Since C does not specify the direction of rounding for negative - * quotients, we have to force the dividend positive for portability. - * - * In most files, at least half of the output values will be zero - * (at default quantization settings, more like three-quarters...) - * so we should ensure that this case is fast. On many machines, - * a comparison is enough cheaper than a divide to make a special test - * a win. Since both inputs will be nonnegative, we need only test - * for a < b to discover whether a/b is 0. - * If your machine's division is fast enough, define FAST_DIVIDE. - */ -#ifdef FAST_DIVIDE -#define DIVIDE_BY(a,b) a /= b -#else -#define DIVIDE_BY(a,b) if (a >= b) a /= b; else a = 0 -#endif + recip = divisors[i + DCTSIZE2 * 0]; + corr = divisors[i + DCTSIZE2 * 1]; + shift = divisors[i + DCTSIZE2 * 3]; if (temp < 0) { temp = -temp; - temp += qval>>1; /* for rounding */ - DIVIDE_BY(temp, qval); + product = (UDCTELEM2)(temp + corr) * recip; + product >>= shift + sizeof(DCTELEM)*8; + temp = product; temp = -temp; } else { - temp += qval>>1; /* for rounding */ - DIVIDE_BY(temp, qval); + product = (UDCTELEM2)(temp + corr) * recip; + product >>= shift + sizeof(DCTELEM)*8; + temp = product; } + output_ptr[i] = (JCOEF) temp; } } |