From c02c05d403f000d15f736458b8c26ad60b4811f0 Mon Sep 17 00:00:00 2001
From: Pierre Ossman <ossman@cendio.se>
Date: Thu, 30 Jan 2014 10:47:07 +0100
Subject: [PATCH] Add optimised buffer conversion

Handles the common cases where the target or the source is in
the preferred 888 format.

If only one of the buffers is 888, then the other one must be properly
aligned to its pixel size (which is commonly the case), or the generic
code is used. Performance is now in many cases on par with PixelTransformer.
---
 common/rfb/PixelFormat.cxx    | 113 +++++++++++++++++++++++++
 common/rfb/PixelFormat.h      |  23 +++++
 common/rfb/PixelFormatBPP.cxx | 155 ++++++++++++++++++++++++++++++++++
 3 files changed, 291 insertions(+)
 create mode 100644 common/rfb/PixelFormatBPP.cxx

diff --git a/common/rfb/PixelFormat.cxx b/common/rfb/PixelFormat.cxx
index 53b7ea50..918e215f 100644
--- a/common/rfb/PixelFormat.cxx
+++ b/common/rfb/PixelFormat.cxx
@@ -19,6 +19,7 @@
  */
 #include <assert.h>
 #include <stdio.h>
+#include <stdint.h>
 #include <string.h>
 #include <rdr/InStream.h>
 #include <rdr/OutStream.h>
@@ -357,6 +358,8 @@ void PixelFormat::bufferFromBuffer(rdr::U8* dst, const PixelFormat &srcPF,
   bufferFromBuffer(dst, srcPF, src, pixels, 1, pixels, pixels);
 }
 
+#define IS_ALIGNED(v, a) (((intptr_t)(v) & ((a)-1)) == 0) /* a: power of 2 */
+
 void PixelFormat::bufferFromBuffer(rdr::U8* dst, const PixelFormat &srcPF,
                                    const rdr::U8* src, int w, int h,
                                    int dstStride, int srcStride) const
@@ -368,6 +371,77 @@ void PixelFormat::bufferFromBuffer(rdr::U8* dst, const PixelFormat &srcPF,
       dst += dstStride * bpp/8;
       src += srcStride * srcPF.bpp/8;
     }
+  } else if (is888() && srcPF.is888()) {
+    // Optimised common case A: byte shuffling (e.g. endian conversion)
+    rdr::U8 *d[4];
+    int dstPad, srcPad;
+
+    // d[i] is where source byte i of each pixel goes. A channel with shift
+    // s is byte s/8 of a little endian pixel and byte (24 - s)/8 of a big
+    // endian one, so each buffer's own byte order picks its offsets.
+    const int sBase = srcPF.bigEndian ? 24 : 0;
+    const int sSign = srcPF.bigEndian ? -1 : 1;
+    const int dBase = bigEndian ? 24 : 0;
+    const int dSign = bigEndian ? -1 : 1;
+    const int sX = 48 - srcPF.redShift - srcPF.greenShift - srcPF.blueShift;
+    const int dX = 48 - redShift - greenShift - blueShift;
+    d[(sBase + sSign*srcPF.redShift)/8] = dst + (dBase + dSign*redShift)/8;
+    d[(sBase + sSign*srcPF.greenShift)/8] = dst + (dBase + dSign*greenShift)/8;
+    d[(sBase + sSign*srcPF.blueShift)/8] = dst + (dBase + dSign*blueShift)/8;
+    d[(sBase + sSign*sX)/8] = dst + (dBase + dSign*dX)/8;
+
+    dstPad = (dstStride - w) * 4;
+    srcPad = (srcStride - w) * 4;
+    while (h--) {
+      int w_ = w;
+      while (w_--) {
+        *d[0] = *(src++);
+        *d[1] = *(src++);
+        *d[2] = *(src++);
+        *d[3] = *(src++);
+        d[0] += 4;
+        d[1] += 4;
+        d[2] += 4;
+        d[3] += 4;
+      }
+      d[0] += dstPad;
+      d[1] += dstPad;
+      d[2] += dstPad;
+      d[3] += dstPad;
+      src += srcPad;
+    }
+  } else if (IS_ALIGNED(dst, bpp/8) && srcPF.is888()) {
+    // Optimised common case B: 888 source, aligned non-888 destination
+    switch (bpp) {
+    case 8:
+      directBufferFromBufferFrom888((rdr::U8*)dst, srcPF, src,
+                                    w, h, dstStride, srcStride);
+      break;
+    case 16:
+      directBufferFromBufferFrom888((rdr::U16*)dst, srcPF, src,
+                                    w, h, dstStride, srcStride);
+      break;
+    case 32:
+      directBufferFromBufferFrom888((rdr::U32*)dst, srcPF, src,
+                                    w, h, dstStride, srcStride);
+      break;
+    }
+  } else if (IS_ALIGNED(src, srcPF.bpp/8) && is888()) {
+    // Optimised common case C: 888 destination, aligned non-888 source
+    switch (srcPF.bpp) {
+    case 8:
+      directBufferFromBufferTo888(dst, srcPF, (rdr::U8*)src,
+                                  w, h, dstStride, srcStride);
+      break;
+    case 16:
+      directBufferFromBufferTo888(dst, srcPF, (rdr::U16*)src,
+                                  w, h, dstStride, srcStride);
+      break;
+    case 32:
+      directBufferFromBufferTo888(dst, srcPF, (rdr::U32*)src,
+                                  w, h, dstStride, srcStride);
+      break;
+    }
   } else {
     // Generic code
     int dstPad = (dstStride - w) * bpp/8;
@@ -600,3 +674,42 @@ bool PixelFormat::isSane(void)
 
   return true;
 }
+
+// Preprocessor generated, optimised methods (one include per INBPP/OUTBPP pair)
+
+#define INBPP 8
+#define OUTBPP 8
+#include "PixelFormatBPP.cxx"
+#undef OUTBPP
+#define OUTBPP 16
+#include "PixelFormatBPP.cxx"
+#undef OUTBPP
+#define OUTBPP 32
+#include "PixelFormatBPP.cxx"
+#undef OUTBPP
+#undef INBPP
+
+#define INBPP 16
+#define OUTBPP 8
+#include "PixelFormatBPP.cxx"
+#undef OUTBPP
+#define OUTBPP 16
+#include "PixelFormatBPP.cxx"
+#undef OUTBPP
+#define OUTBPP 32
+#include "PixelFormatBPP.cxx"
+#undef OUTBPP
+#undef INBPP
+
+#define INBPP 32
+#define OUTBPP 8
+#include "PixelFormatBPP.cxx"
+#undef OUTBPP
+#define OUTBPP 16
+#include "PixelFormatBPP.cxx"
+#undef OUTBPP
+#define OUTBPP 32
+#include "PixelFormatBPP.cxx"
+#undef OUTBPP
+#undef INBPP
+
diff --git a/common/rfb/PixelFormat.h b/common/rfb/PixelFormat.h
index b18045f7..113b8eef 100644
--- a/common/rfb/PixelFormat.h
+++ b/common/rfb/PixelFormat.h
@@ -90,6 +90,29 @@ namespace rfb {
     void updateState(void);
     bool isSane(void);
 
+  private:
+    // Preprocessor generated, optimised methods (bodies in PixelFormatBPP.cxx)
+
+    void directBufferFromBufferFrom888(rdr::U8* dst, const PixelFormat &srcPF,
+                                       const rdr::U8* src, int w, int h,
+                                       int dstStride, int srcStride) const;
+    void directBufferFromBufferFrom888(rdr::U16* dst, const PixelFormat &srcPF,
+                                       const rdr::U8* src, int w, int h,
+                                       int dstStride, int srcStride) const;
+    void directBufferFromBufferFrom888(rdr::U32* dst, const PixelFormat &srcPF,
+                                       const rdr::U8* src, int w, int h,
+                                       int dstStride, int srcStride) const;
+
+    void directBufferFromBufferTo888(rdr::U8* dst, const PixelFormat &srcPF,
+                                     const rdr::U8* src, int w, int h,
+                                     int dstStride, int srcStride) const;
+    void directBufferFromBufferTo888(rdr::U8* dst, const PixelFormat &srcPF,
+                                     const rdr::U16* src, int w, int h,
+                                     int dstStride, int srcStride) const;
+    void directBufferFromBufferTo888(rdr::U8* dst, const PixelFormat &srcPF,
+                                     const rdr::U32* src, int w, int h,
+                                     int dstStride, int srcStride) const;
+
   public:
     int bpp;
     int depth;
diff --git a/common/rfb/PixelFormatBPP.cxx b/common/rfb/PixelFormatBPP.cxx
new file mode 100644
index 00000000..6b5ad6bb
--- /dev/null
+++ b/common/rfb/PixelFormatBPP.cxx
@@ -0,0 +1,155 @@
+/* Copyright 2014 Pierre Ossman for Cendio AB
+ * 
+ * This is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This software is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this software; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307,
+ * USA.
+ */
+
+#define CONCAT2(a,b) a##b
+#define CONCAT2E(a,b) CONCAT2(a,b) /* expands a and b before pasting */
+
+#define UIN CONCAT2E(U,INBPP) /* source pixel type: U<INBPP> */
+#define UOUT CONCAT2E(U,OUTBPP) /* destination pixel type: U<OUTBPP> */
+
+#define SWAP16(n) ((((n) & 0xff) << 8) | (((n) >> 8) & 0xff)) /* byte swap */
+#define SWAP32(n) (((n) >> 24) | (((n) & 0x00ff0000) >> 8) | \
+                   (((n) & 0x0000ff00) << 8) | ((n) << 24))
+
+#define SWAPIN CONCAT2E(SWAP,INBPP) /* SWAP<INBPP>; used only if INBPP != 8 */
+#define SWAPOUT CONCAT2E(SWAP,OUTBPP) /* SWAP<OUTBPP>; used only if OUTBPP != 8 */
+
+#if INBPP == 32
+
+void PixelFormat::directBufferFromBufferFrom888(rdr::UOUT* dst,
+                                                const PixelFormat &srcPF,
+                                                const rdr::U8* src,
+                                                int w, int h,
+                                                int dstStride,
+                                                int srcStride) const
+{
+  // Converts w x h pixels from the 888 format srcPF (4 bytes per pixel)
+  // into this format at OUTBPP bits per pixel. Strides are in pixels.
+
+  // Number of low bits dropped from each 8 bit source channel
+  const int redDrop = 8 - redBits;
+  const int greenDrop = 8 - greenBits;
+  const int blueDrop = 8 - blueBits;
+
+  // One read pointer per channel, placed on that channel's byte within
+  // the first source pixel
+  const rdr::U8 *redSrc, *greenSrc, *blueSrc;
+  if (!srcPF.bigEndian) {
+    redSrc = src + srcPF.redShift/8;
+    greenSrc = src + srcPF.greenShift/8;
+    blueSrc = src + srcPF.blueShift/8;
+  } else {
+    redSrc = src + (24 - srcPF.redShift)/8;
+    greenSrc = src + (24 - srcPF.greenShift)/8;
+    blueSrc = src + (24 - srcPF.blueShift)/8;
+  }
+
+  const int dstSkip = dstStride - w;        // in destination pixels
+  const int srcSkip = (srcStride - w) * 4;  // in source bytes
+
+  for (int rowsLeft = h; rowsLeft != 0; rowsLeft--) {
+    for (int colsLeft = w; colsLeft != 0; colsLeft--) {
+      rdr::UOUT px = ((*redSrc >> redDrop) << redShift) |
+                     ((*greenSrc >> greenDrop) << greenShift) |
+                     ((*blueSrc >> blueDrop) << blueShift);
+
+#if OUTBPP != 8
+      // Byte swap the finished pixel if this format is flagged endianMismatch
+      if (endianMismatch)
+        px = SWAPOUT(px);
+#endif
+
+      *dst++ = px;
+      redSrc += 4;
+      greenSrc += 4;
+      blueSrc += 4;
+    }
+    // Skip whatever lies between the end of this row and the next one
+    dst += dstSkip;
+    redSrc += srcSkip;
+    greenSrc += srcSkip;
+    blueSrc += srcSkip;
+  }
+}
+
+#endif /* INBPP == 32 */
+
+#if OUTBPP == 32
+
+void PixelFormat::directBufferFromBufferTo888(rdr::U8* dst,
+                                              const PixelFormat &srcPF,
+                                              const rdr::UIN* src,
+                                              int w, int h,
+                                              int dstStride,
+                                              int srcStride) const
+{
+  // Converts w x h pixels from srcPF (INBPP bits per pixel) into this 888
+  // format at 4 bytes per pixel. Both strides are counted in pixels.
+
+  // Slice of upconvTable selected by each source channel's bit count
+  const rdr::U8 *redLut = &upconvTable[(srcPF.redBits-1)*256];
+  const rdr::U8 *greenLut = &upconvTable[(srcPF.greenBits-1)*256];
+  const rdr::U8 *blueLut = &upconvTable[(srcPF.blueBits-1)*256];
+
+  // One write pointer per byte of the destination pixel; padDst is the
+  // byte left over after red, green and blue
+  rdr::U8 *redDst, *greenDst, *blueDst, *padDst;
+  if (!bigEndian) {
+    redDst = dst + redShift/8;
+    greenDst = dst + greenShift/8;
+    blueDst = dst + blueShift/8;
+    padDst = dst + (48 - redShift - greenShift - blueShift)/8;
+  } else {
+    redDst = dst + (24 - redShift)/8;
+    greenDst = dst + (24 - greenShift)/8;
+    blueDst = dst + (24 - blueShift)/8;
+    padDst = dst + (24 - (48 - redShift - greenShift - blueShift))/8;
+  }
+
+  const int dstSkip = (dstStride - w) * 4;  // in destination bytes
+  const int srcSkip = srcStride - w;        // in source pixels
+
+  for (int rowsLeft = h; rowsLeft != 0; rowsLeft--) {
+    for (int colsLeft = w; colsLeft != 0; colsLeft--) {
+      rdr::UIN px = *src++;
+
+#if INBPP != 8
+      // Byte swap the source pixel if srcPF is flagged endianMismatch
+      if (srcPF.endianMismatch)
+        px = SWAPIN(px);
+#endif
+
+      *redDst = redLut[(px >> srcPF.redShift) & 0xff];
+      *greenDst = greenLut[(px >> srcPF.greenShift) & 0xff];
+      *blueDst = blueLut[(px >> srcPF.blueShift) & 0xff];
+      *padDst = 0;
+
+      redDst += 4;
+      greenDst += 4;
+      blueDst += 4;
+      padDst += 4;
+    }
+    redDst += dstSkip;
+    greenDst += dstSkip;
+    blueDst += dstSkip;
+    padDst += dstSkip;
+    src += srcSkip;
+  }
+}
+
+#endif /* OUTBPP == 32 */
-- 
2.39.5