diff --git a/.github/workflows/ccp-workflow.yml b/.github/workflows/ccp-workflow.yml
index 3104ce51..e7fa60e8 100644
--- a/.github/workflows/ccp-workflow.yml
+++ b/.github/workflows/ccp-workflow.yml
@@ -13,6 +13,7 @@ on:
         options:
           - 'test_arm32'
           - 'test_s390x'
+          - 'test_ppc64le'
         default: 'test_arm32'
       log_level:
         description: 'Log level'
@@ -32,7 +33,7 @@ jobs:
   #       ]
   #   name: ${{ matrix.system }} Build
   #   runs-on: ${{ matrix.runner }}
-  #   if: ${{ github.event_name != 'workflow_dispatch' || !(github.event.inputs.job_to_run != 'test_arm32' || github.event.inputs.job_to_run != 'test_s390x' )}}
+  #   if: ${{ github.event_name != 'workflow_dispatch' || !(github.event.inputs.job_to_run != 'test_arm32' || github.event.inputs.job_to_run != 'test_s390x' || github.event.inputs.job_to_run != 'test_ppc64le' )}}
   #   steps:
   #   - uses: actions/checkout@v5
   #   - name: cmake
@@ -51,7 +52,7 @@ jobs:
         ]
     name: ${{ matrix.system }} Build
     runs-on: ${{ matrix.runner }}
-    if: ${{ github.event_name != 'workflow_dispatch' || !(github.event.inputs.job_to_run != 'test_arm32' || github.event.inputs.job_to_run != 'test_s390x' )}}
+    if: ${{ github.event_name != 'workflow_dispatch' || !(github.event.inputs.job_to_run != 'test_arm32' || github.event.inputs.job_to_run != 'test_s390x' || github.event.inputs.job_to_run != 'test_ppc64le' )}}
     steps:
     - uses: actions/checkout@v5
     - name: cmake
@@ -70,7 +71,7 @@ jobs:
   #       ]
   #   name: ${{ matrix.system }} Build
   #   runs-on: ${{ matrix.runner }}
-  #   if: ${{ github.event_name != 'workflow_dispatch' || !(github.event.inputs.job_to_run != 'test_arm32' || github.event.inputs.job_to_run != 'test_s390x' )}}
+  #   if: ${{ github.event_name != 'workflow_dispatch' || !(github.event.inputs.job_to_run != 'test_arm32' || github.event.inputs.job_to_run != 'test_s390x' || github.event.inputs.job_to_run != 'test_ppc64le' )}}
   #   steps:
   #   - uses: actions/checkout@v5
   #   - name: cmake
@@ -90,7 +91,7 @@ jobs:
   #       ]
   #   name: ${{ matrix.system }} Build
   #   runs-on: ${{ matrix.runner }}
-  #   if: ${{ github.event_name != 'workflow_dispatch' || !(github.event.inputs.job_to_run != 'test_arm32' || github.event.inputs.job_to_run != 'test_s390x' )}}
+  #   if: ${{ github.event_name != 'workflow_dispatch' || !(github.event.inputs.job_to_run != 'test_arm32' || github.event.inputs.job_to_run != 'test_s390x' || github.event.inputs.job_to_run != 'test_ppc64le' )}}
   #   defaults:
   #     run:
   #       shell: msys2 {0}
@@ -117,7 +118,7 @@ jobs:
   #       ]
   #   name: ${{ matrix.system }} Build
   #   runs-on: ${{ matrix.runner }}
-  #   if: ${{ github.event_name != 'workflow_dispatch' || !(github.event.inputs.job_to_run != 'test_arm32' || github.event.inputs.job_to_run != 'test_s390x' )}}
+  #   if: ${{ github.event_name != 'workflow_dispatch' || !(github.event.inputs.job_to_run != 'test_arm32' || github.event.inputs.job_to_run != 'test_s390x' || github.event.inputs.job_to_run != 'test_ppc64le' )}}
   #   steps:
   #   - uses: actions/checkout@v5
   #   - name: cmake
@@ -139,7 +140,7 @@ jobs:
         ]
     name: ${{ matrix.system }} Build and Test
     runs-on: ${{ matrix.runner }}
-    if: ${{ github.event_name != 'workflow_dispatch' || !(github.event.inputs.job_to_run != 'test_arm32' || github.event.inputs.job_to_run != 'test_s390x' )}}
+    if: ${{ github.event_name != 'workflow_dispatch' || !(github.event.inputs.job_to_run != 'test_arm32' || github.event.inputs.job_to_run != 'test_s390x' || github.event.inputs.job_to_run != 'test_ppc64le' )}}
     steps:
     - uses: actions/checkout@v5
     - name: cmake
@@ -161,7 +162,7 @@ jobs:
         ]
     name: ${{ matrix.system }} Build and Test
     runs-on: ${{ matrix.runner }}
-    if: ${{ github.event_name != 'workflow_dispatch' || !(github.event.inputs.job_to_run != 'test_arm32' || github.event.inputs.job_to_run != 'test_s390x' )}}
+    if: ${{ github.event_name != 'workflow_dispatch' || !(github.event.inputs.job_to_run != 'test_arm32' || github.event.inputs.job_to_run != 'test_s390x' || github.event.inputs.job_to_run != 'test_ppc64le' )}}
     steps:
     - uses: actions/checkout@v5
     - name: cmake
@@ -184,7 +185,7 @@ jobs:
         ]
     name: ${{ matrix.system }} Build and Test
     runs-on: ${{ matrix.runner }}
-    if: ${{ github.event_name != 'workflow_dispatch' || !(github.event.inputs.job_to_run != 'test_arm32' || github.event.inputs.job_to_run != 'test_s390x' )}}
+    if: ${{ github.event_name != 'workflow_dispatch' || !(github.event.inputs.job_to_run != 'test_arm32' || github.event.inputs.job_to_run != 'test_s390x' || github.event.inputs.job_to_run != 'test_ppc64le' )}}
     defaults:
       run:
         shell: msys2 {0}
@@ -214,7 +215,7 @@ jobs:
         ]
     name: ${{ matrix.system }} Build and Test
     runs-on: ${{ matrix.runner }}
-    if: ${{ github.event_name != 'workflow_dispatch' || !(github.event.inputs.job_to_run != 'test_arm32' || github.event.inputs.job_to_run != 'test_s390x' )}}
+    if: ${{ github.event_name != 'workflow_dispatch' || !(github.event.inputs.job_to_run != 'test_arm32' || github.event.inputs.job_to_run != 'test_s390x' || github.event.inputs.job_to_run != 'test_ppc64le' )}}
     steps:
     - uses: actions/checkout@v5
     - name: cmake
@@ -266,3 +267,31 @@ jobs:
           cmake -DCMAKE_BUILD_TYPE=Release -DOJPH_BUILD_STREAM_EXPAND=ON -DOJPH_ENABLE_TIFF_SUPPORT=OFF -DOJPH_BUILD_TESTS=ON ..
           make
           ctest --output-on-failure
+
+  test_powerpc64le:
+    name: Linux-ppc64le Build and Test
+    strategy:
+      fail-fast: false
+      matrix:
+        include: [
+          { dist: ubuntu22.04 },
+          { dist: ubuntu24.04 },
+          { dist: ubuntu_latest },
+        ]
+    runs-on: ubuntu-latest
+    if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.job_to_run == 'test_ppc64le' }}
+    steps:
+    - uses: actions/checkout@v5
+    - uses: uraimo/run-on-arch-action@v3
+      with:
+        arch: ppc64le
+        distro: ${{ matrix.dist }}
+        githubToken: ${{ github.token }}
+        install: |
+          apt-get update -q -y
+          apt-get install -q -y cmake make g++ libtiff-dev python3
+        run: |
+          cd build
+          cmake -DCMAKE_BUILD_TYPE=Release -DOJPH_BUILD_STREAM_EXPAND=ON -DOJPH_ENABLE_TIFF_SUPPORT=OFF -DOJPH_BUILD_TESTS=ON ..
+          make
+          ctest --output-on-failure
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 0a13d3b2..c2079ff0 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -5,11 +5,13 @@ file(GLOB CODESTREAM_SSE2  "codestream/*_sse2.cpp")
 file(GLOB CODESTREAM_AVX   "codestream/*_avx.cpp")
 file(GLOB CODESTREAM_AVX2  "codestream/*_avx2.cpp")
 file(GLOB CODESTREAM_WASM  "codestream/*_wasm.cpp")
+file(GLOB CODESTREAM_VSX   "codestream/*_vsx.cpp")
 file(GLOB CODING           "coding/*.cpp" "coding/*.h")
 file(GLOB CODING_SSSE3     "coding/*_ssse3.cpp")
 file(GLOB CODING_WASM      "coding/*_wasm.cpp")
 file(GLOB CODING_AVX2      "coding/*_avx2.cpp")
 file(GLOB CODING_AVX512    "coding/*_avx512.cpp")
+file(GLOB CODING_VSX       "coding/*_vsx.cpp")
 file(GLOB COMMON           "openjph/*.h")
 file(GLOB OTHERS           "others/*.cpp" "others/*.c")
 file(GLOB TRANSFORM        "transform/*.cpp" "transform/*.h")
@@ -19,10 +21,11 @@ file(GLOB TRANSFORM_AVX    "transform/*_avx.cpp")
 file(GLOB TRANSFORM_AVX2   "transform/*_avx2.cpp")
 file(GLOB TRANSFORM_AVX512 "transform/*_avx512.cpp")
 file(GLOB TRANSFORM_WASM   "transform/*_wasm.cpp")
+file(GLOB TRANSFORM_VSX    "transform/*_vsx.cpp")
 
-list(REMOVE_ITEM CODESTREAM ${CODESTREAM_SSE} ${CODESTREAM_SSE2} ${CODESTREAM_AVX} ${CODESTREAM_AVX2} ${CODESTREAM_WASM})
-list(REMOVE_ITEM CODING ${CODING_SSSE3} ${CODING_WASM} ${CODING_AVX2} ${CODING_AVX512})
-list(REMOVE_ITEM TRANSFORM ${TRANSFORM_SSE} ${TRANSFORM_SSE2} ${TRANSFORM_AVX} ${TRANSFORM_AVX2} ${TRANSFORM_AVX512} ${TRANSFORM_WASM})
+list(REMOVE_ITEM CODESTREAM ${CODESTREAM_SSE} ${CODESTREAM_SSE2} ${CODESTREAM_AVX} ${CODESTREAM_AVX2} ${CODESTREAM_WASM} ${CODESTREAM_VSX})
+list(REMOVE_ITEM CODING ${CODING_SSSE3} ${CODING_WASM} ${CODING_AVX2} ${CODING_AVX512} ${CODING_VSX})
+list(REMOVE_ITEM TRANSFORM ${TRANSFORM_SSE} ${TRANSFORM_SSE2} ${TRANSFORM_AVX} ${TRANSFORM_AVX2} ${TRANSFORM_AVX512} ${TRANSFORM_WASM} ${TRANSFORM_VSX})
 list(APPEND SOURCES ${CODESTREAM} ${CODING} ${COMMON} ${OTHERS} ${TRANSFORM})
 
 source_group("codestream"        FILES ${CODESTREAM})
@@ -112,6 +115,22 @@ else()
 
     endif()
 
+    if (("${OJPH_TARGET_ARCH}" MATCHES "OJPH_ARCH_PPC64")
+        AND (CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le|powerpc64le"))
+      # native 128-bit VSX kernels (see ojph_simd_vsx.h).  Supported
+      # targets are POWER9 (ISA 3.0) and newer, little-endian only.
+      # The block decoder dispatch is selective; see
+      # ojph_codeblock_fun.cpp.
+      list(APPEND SOURCES ${CODESTREAM_VSX} ${CODING_VSX} ${TRANSFORM_VSX})
+      source_group("codestream" FILES ${CODESTREAM_VSX})
+      source_group("coding" FILES ${CODING_VSX})
+      source_group("transform" FILES ${TRANSFORM_VSX})
+      set_source_files_properties(codestream/ojph_codestream_vsx.cpp PROPERTIES COMPILE_FLAGS "-mcpu=power9")
+      set_source_files_properties(coding/ojph_block_decoder_vsx.cpp PROPERTIES COMPILE_FLAGS "-mcpu=power9")
+      set_source_files_properties(transform/ojph_transform_vsx.cpp PROPERTIES COMPILE_FLAGS "-mcpu=power9")
+      set_source_files_properties(transform/ojph_colour_vsx.cpp PROPERTIES COMPILE_FLAGS "-mcpu=power9")
+    endif()
+
   endif()
 
 endif()
diff --git a/src/core/codestream/ojph_codeblock_fun.cpp b/src/core/codestream/ojph_codeblock_fun.cpp
index 2bf0d057..0c0345fc 100644
--- a/src/core/codestream/ojph_codeblock_fun.cpp
+++ b/src/core/codestream/ojph_codeblock_fun.cpp
@@ -61,16 +61,19 @@ namespace ojph {
     void sse_mem_clear(void* addr, size_t count);
     void avx_mem_clear(void* addr, size_t count);
     void wasm_mem_clear(void* addr, size_t count);
+    void vsx_mem_clear(void* addr, size_t count);
 
     //////////////////////////////////////////////////////////////////////////
     ui32  gen_find_max_val32(ui32* address);
     ui32 sse2_find_max_val32(ui32* address);
     ui32 avx2_find_max_val32(ui32* address);
     ui32 wasm_find_max_val32(ui32* address);
+    ui32 vsx_find_max_val32(ui32* address);
     ui64  gen_find_max_val64(ui64* address);
     ui64 sse2_find_max_val64(ui64* address);
     ui64 avx2_find_max_val64(ui64* address);
     ui64 wasm_find_max_val64(ui64* address);
+    ui64 vsx_find_max_val64(ui64* address);
 
 
     //////////////////////////////////////////////////////////////////////////
@@ -88,8 +91,12 @@ namespace ojph {
                              float delta_inv, ui32 count, ui32* max_val);
     void wasm_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
                              float delta_inv, ui32 count, ui32* max_val);
+    void vsx_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val);
     void wasm_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
                              float delta_inv, ui32 count, ui32* max_val);
+    void vsx_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val);
 
     void  gen_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max,
                              float delta_inv, ui32 count, ui64* max_val);
@@ -99,6 +106,8 @@ namespace ojph {
                              float delta_inv, ui32 count, ui64* max_val);
     void wasm_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max,
                              float delta_inv, ui32 count, ui64* max_val);
+    void vsx_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui64* max_val);
 
     //////////////////////////////////////////////////////////////////////////
     void  gen_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
@@ -115,8 +124,12 @@ namespace ojph {
                                float delta, ui32 count);
     void wasm_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
                                float delta, ui32 count);
+    void vsx_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count);
     void wasm_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
                                float delta, ui32 count);
+    void vsx_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count);
 
     void  gen_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max,
                                float delta, ui32 count);
@@ -128,6 +141,8 @@ namespace ojph {
                               float delta, ui32 count);
     void wasm_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max,
                                float delta, ui32 count);
+    void vsx_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count);
 
     void codeblock_fun::init(bool reversible) {
 
@@ -246,6 +261,40 @@ namespace ojph {
 
     #elif defined(OJPH_ARCH_ARM)
 
+    #elif defined(OJPH_ARCH_PPC64LE)
+
+      // 128-bit VSX kernels; see ojph_simd_vsx.h.
+      // The SIMD block decoder is used everywhere on POWER10 (ISA 3.1),
+      // where it beats the scalar decoder on all measured content.  On
+      // POWER9 it wins for irreversible content (more magnitude bits
+      // per sample) but trails the scalar decoder slightly on
+      // reversible content, so it is dispatched only for the former.
+      if (get_cpu_ext_level() >= PPC_CPU_EXT_LEVEL_ARCH_3_1 ||
+          (!reversible &&
+           get_cpu_ext_level() >= PPC_CPU_EXT_LEVEL_ARCH_3_00))
+        decode_cb32 = ojph_decode_codeblock_vsx;
+      if (get_cpu_ext_level() >= PPC_CPU_EXT_LEVEL_ARCH_3_00) {
+        find_max_val32 = vsx_find_max_val32;
+        mem_clear = vsx_mem_clear;
+        if (reversible) {
+          tx_to_cb32 = vsx_rev_tx_to_cb32;
+          tx_from_cb32 = vsx_rev_tx_from_cb32;
+        }
+        else {
+          tx_to_cb32 = vsx_irv_tx_to_cb32;
+          tx_from_cb32 = vsx_irv_tx_from_cb32;
+        }
+        find_max_val64 = vsx_find_max_val64;
+        if (reversible) {
+          tx_to_cb64 = vsx_rev_tx_to_cb64;
+          tx_from_cb64 = vsx_rev_tx_from_cb64;
+        }
+        else {
+          tx_to_cb64 = NULL;
+          tx_from_cb64 = gen_irv_tx_from_cb64;
+        }
+      }
+
     #endif // !(defined(OJPH_ARCH_X86_64) || defined(OJPH_ARCH_I386))
 
   #endif // !OJPH_DISABLE_SIMD
diff --git a/src/core/codestream/ojph_codestream_local.cpp b/src/core/codestream/ojph_codestream_local.cpp
index bd523ec7..bd402844 100644
--- a/src/core/codestream/ojph_codestream_local.cpp
+++ b/src/core/codestream/ojph_codestream_local.cpp
@@ -640,7 +640,7 @@ namespace ojph {
       this->pre_alloc();
       this->finalize_alloc();
 
-      ui16 t = swap_bytes_if_le(JP2K_MARKER::SOC);
+      ui16 t = swap_bytes_if_le((ui16)JP2K_MARKER::SOC);
       if (file->write(&t, 2) != 2)
         OJPH_ERROR(0x00030022, "Error writing to file");
 
@@ -670,7 +670,7 @@ namespace ojph {
         OJPH_INT_TO_STRING(OPENJPH_VERSION_MINOR) "."
         OJPH_INT_TO_STRING(OPENJPH_VERSION_PATCH) ".";
       size_t len = strlen(buf);
-      *(ui16*)buf = swap_bytes_if_le(JP2K_MARKER::COM);
+      *(ui16*)buf = swap_bytes_if_le((ui16)JP2K_MARKER::COM);
       *(ui16*)(buf + 2) = swap_bytes_if_le((ui16)(len - 2));
       //1 for General use (IS 8859-15:1999 (Latin) values)
       *(ui16*)(buf + 4) = swap_bytes_if_le((ui16)(1));
@@ -680,7 +680,7 @@ namespace ojph {
       if (comments != NULL) {
         for (ui32 i = 0; i < num_comments; ++i)
         {
-          t = swap_bytes_if_le(JP2K_MARKER::COM);
+          t = swap_bytes_if_le((ui16)JP2K_MARKER::COM);
           if (file->write(&t, 2) != 2)
             OJPH_ERROR(0x00030029, "Error writing to file");
           t = swap_bytes_if_le((ui16)(comments[i].len + 4));
@@ -1151,7 +1151,7 @@ namespace ojph {
       }
       for (si32 i = 0; i < repeat; ++i)
         tiles[i].flush(outfile);
-      ui16 t = swap_bytes_if_le(JP2K_MARKER::EOC);
+      ui16 t = swap_bytes_if_le((ui16)JP2K_MARKER::EOC);
       if (!outfile->write(&t, 2))
         OJPH_ERROR(0x00030071, "Error writing to file");
     }
diff --git a/src/core/codestream/ojph_codestream_vsx.cpp b/src/core/codestream/ojph_codestream_vsx.cpp
new file mode 100644
index 00000000..9c668e00
--- /dev/null
+++ b/src/core/codestream/ojph_codestream_vsx.cpp
@@ -0,0 +1,289 @@
+//***************************************************************************/
+// This software is released under the 2-Clause BSD license, included
+// below.
+//
+// Copyright (c) 2022, Aous Naman
+// Copyright (c) 2022, Kakadu Software Pty Ltd, Australia
+// Copyright (c) 2022, The University of New South Wales, Australia
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************/
+// This file is part of the OpenJPH software implementation.
+// File: ojph_codestream_vsx.cpp
+// Author: Aous Naman
+// Date: 15 May 2022
+//***************************************************************************/
+
+#include <climits>
+#include <cstddef>
+#include "ojph_simd_vsx.h"
+
+#include "ojph_defs.h"
+
+namespace ojph {
+  namespace local {
+
+    //////////////////////////////////////////////////////////////////////////
+    void vsx_mem_clear(void* addr, size_t count)
+    {
+      float* p = (float*)addr;
+      v128_t zero = vsx_i32x4_splat(0);
+      for (size_t i = 0; i < count; i += 16, p += 4)
+        vsx_v128_store(p, zero);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    ui32 vsx_find_max_val32(ui32* address)
+    {
+      v128_t x1, x0 = vsx_v128_load(address);
+      x1 = vsx_i32x4_shuffle(x0, x0, 2, 3, 2, 3);   // x1 = x0[2,3,2,3]
+      x0 = vsx_v128_or(x0, x1);
+      x1 = vsx_i32x4_shuffle(x0, x0, 1, 1, 1, 1);   // x1 = x0[1,1,1,1]
+      x0 = vsx_v128_or(x0, x1);
+      ui32 t = (ui32)vsx_i32x4_extract_lane(x0, 0);
+      return t;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    ui64 vsx_find_max_val64(ui64* address)
+    {
+      v128_t x1, x0 = vsx_v128_load(address);
+      x1 = vsx_i64x2_shuffle(x0, x0, 1, 1);   // x1 = x0[2,3,2,3]
+      x0 = vsx_v128_or(x0, x1);
+      ui64 t = (ui64)vsx_i64x2_extract_lane(x0, 0);
+      return t;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void vsx_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val)
+    {
+      ojph_unused(delta_inv);
+
+      // convert to sign and magnitude and keep max_val
+      ui32 shift = 31 - K_max;
+      v128_t m0 = vsx_i32x4_splat(INT_MIN);
+      v128_t zero = vsx_i32x4_splat(0);
+      v128_t one = vsx_i32x4_splat(1);
+      v128_t tmax = vsx_v128_load(max_val);
+      si32 *p = (si32*)sp;
+      for ( ; count >= 4; count -= 4, p += 4, dp += 4)
+      {
+        v128_t v = vsx_v128_load(p);
+        v128_t sign = vsx_i32x4_lt(v, zero);
+        v128_t val = vsx_v128_xor(v, sign); // negate 1's complement
+        v128_t ones = vsx_v128_and(sign, one);
+        val = vsx_i32x4_add(val, ones);     // 2's complement
+        sign = vsx_v128_and(sign, m0);
+        val = vsx_i32x4_shl(val, shift);
+        tmax = vsx_v128_or(tmax, val);
+        val = vsx_v128_or(val, sign);
+        vsx_v128_store(dp, val);
+      }
+      if (count)
+      {
+        v128_t v = vsx_v128_load(p);
+        v128_t sign = vsx_i32x4_lt(v, zero);
+        v128_t val = vsx_v128_xor(v, sign); // negate 1's complement
+        v128_t ones = vsx_v128_and(sign, one);
+        val = vsx_i32x4_add(val, ones);     // 2's complement
+        sign = vsx_v128_and(sign, m0);
+        val = vsx_i32x4_shl(val, shift);
+
+        v128_t c = vsx_i32x4_splat((si32)count);
+        v128_t idx = vsx_i32x4_make(0, 1, 2, 3);
+        v128_t mask = vsx_i32x4_gt(c, idx);
+        c = vsx_v128_and(val, mask);
+        tmax = vsx_v128_or(tmax, c);
+
+        val = vsx_v128_or(val, sign);
+        vsx_v128_store(dp, val);
+      }
+      vsx_v128_store(max_val, tmax);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void vsx_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val)
+    {
+      ojph_unused(K_max);
+
+      //quantize and convert to sign and magnitude and keep max_val
+
+      v128_t d = vsx_f32x4_splat(delta_inv);
+      v128_t zero = vsx_i32x4_splat(0);
+      v128_t one = vsx_i32x4_splat(1);
+      v128_t tmax = vsx_v128_load(max_val);
+      float *p = (float*)sp;
+      for ( ; count >= 4; count -= 4, p += 4, dp += 4)
+      {
+        v128_t vf = vsx_v128_load(p);
+        vf = vsx_f32x4_mul(vf, d);                   // multiply
+        v128_t val = vsx_i32x4_trunc_sat_f32x4(vf);  // convert to signed int
+        v128_t sign = vsx_i32x4_lt(val, zero);       // get sign
+        val = vsx_v128_xor(val, sign);               // negate 1's complement
+        v128_t ones = vsx_v128_and(sign, one);
+        val = vsx_i32x4_add(val, ones);              // 2's complement
+        tmax = vsx_v128_or(tmax, val);
+        sign = vsx_i32x4_shl(sign, 31);
+        val = vsx_v128_or(val, sign);
+        vsx_v128_store(dp, val);
+      }
+      if (count)
+      {
+        v128_t vf = vsx_v128_load(p);
+        vf = vsx_f32x4_mul(vf, d);                   // multiply
+        v128_t val = vsx_i32x4_trunc_sat_f32x4(vf);  // convert to signed int
+        v128_t sign = vsx_i32x4_lt(val, zero);       // get sign
+        val = vsx_v128_xor(val, sign);               // negate 1's complement
+        v128_t ones = vsx_v128_and(sign, one);
+        val = vsx_i32x4_add(val, ones);              // 2's complement
+
+        v128_t c = vsx_i32x4_splat((si32)count);
+        v128_t idx = vsx_i32x4_make(0, 1, 2, 3);
+        v128_t mask = vsx_i32x4_gt(c, idx);
+        c = vsx_v128_and(val, mask);
+        tmax = vsx_v128_or(tmax, c);
+
+        sign = vsx_i32x4_shl(sign, 31);
+        val = vsx_v128_or(val, sign);
+        vsx_v128_store(dp, val);
+      }
+      vsx_v128_store(max_val, tmax);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void vsx_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count)
+    {
+      ojph_unused(delta);
+      ui32 shift = 31 - K_max;
+      v128_t m1 = vsx_i32x4_splat(INT_MAX);
+      v128_t zero = vsx_i32x4_splat(0);
+      v128_t one = vsx_i32x4_splat(1);
+      si32 *p = (si32*)dp;
+      for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
+      {
+          v128_t v = vsx_v128_load((v128_t*)sp);
+          v128_t val = vsx_v128_and(v, m1);
+          val = vsx_i32x4_shr(val, shift);
+          v128_t sign = vsx_i32x4_lt(v, zero);
+          val = vsx_v128_xor(val, sign); // negate 1's complement
+          v128_t ones = vsx_v128_and(sign, one);
+          val = vsx_i32x4_add(val, ones); // 2's complement
+          vsx_v128_store(p, val);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void vsx_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count)
+    {
+      ojph_unused(K_max);
+      v128_t m1 = vsx_i32x4_splat(INT_MAX);
+      v128_t d = vsx_f32x4_splat(delta);
+      float *p = (float*)dp;
+      for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
+      {
+        v128_t v = vsx_v128_load((v128_t*)sp);
+        v128_t vali = vsx_v128_and(v, m1);
+        v128_t  valf = vsx_f32x4_convert_i32x4(vali);
+        valf = vsx_f32x4_mul(valf, d);
+        v128_t sign = vsx_v128_andnot(v, m1);
+        valf = vsx_v128_or(valf, sign);
+        vsx_v128_store(p, valf);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void vsx_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui64* max_val)
+    {
+      ojph_unused(delta_inv);
+
+      // convert to sign and magnitude and keep max_val
+      ui32 shift = 63 - K_max;
+      v128_t m0 = vsx_i64x2_splat(LLONG_MIN);
+      v128_t zero = vsx_i64x2_splat(0);
+      v128_t one = vsx_i64x2_splat(1);
+      v128_t tmax = vsx_v128_load(max_val);
+      si64 *p = (si64*)sp;
+      for ( ; count >= 2; count -= 2, p += 2, dp += 2)
+      {
+        v128_t v = vsx_v128_load(p);
+        v128_t sign = vsx_i64x2_lt(v, zero);
+        v128_t val = vsx_v128_xor(v, sign); // negate 1's complement
+        v128_t ones = vsx_v128_and(sign, one);
+        val = vsx_i64x2_add(val, ones);     // 2's complement
+        sign = vsx_v128_and(sign, m0);
+        val = vsx_i64x2_shl(val, shift);
+        tmax = vsx_v128_or(tmax, val);
+        val = vsx_v128_or(val, sign);
+        vsx_v128_store(dp, val);
+      }
+      if (count)
+      {
+        v128_t v = vsx_v128_load(p);
+        v128_t sign = vsx_i64x2_lt(v, zero);
+        v128_t val = vsx_v128_xor(v, sign); // negate 1's complement
+        v128_t ones = vsx_v128_and(sign, one);
+        val = vsx_i64x2_add(val, ones);     // 2's complement
+        sign = vsx_v128_and(sign, m0);
+        val = vsx_i64x2_shl(val, shift);
+
+        v128_t c = vsx_i32x4_make((si32)0xFFFFFFFF, (si32)0xFFFFFFFF, 0, 0);
+        c = vsx_v128_and(val, c);
+        tmax = vsx_v128_or(tmax, c);
+
+        val = vsx_v128_or(val, sign);
+        vsx_v128_store(dp, val);
+      }
+
+      vsx_v128_store(max_val, tmax);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void vsx_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count)
+    {
+      ojph_unused(delta);
+      ui32 shift = 63 - K_max;
+      v128_t m1 = vsx_i64x2_splat(LLONG_MAX);
+      v128_t zero = vsx_i64x2_splat(0);
+      v128_t one = vsx_i64x2_splat(1);
+      si64 *p = (si64*)dp;
+      for (ui32 i = 0; i < count; i += 2, sp += 2, p += 2)
+      {
+          v128_t v = vsx_v128_load((v128_t*)sp);
+          v128_t val = vsx_v128_and(v, m1);
+          val = vsx_i64x2_shr(val, shift);
+          v128_t sign = vsx_i64x2_lt(v, zero);
+          val = vsx_v128_xor(val, sign); // negate 1's complement
+          v128_t ones = vsx_v128_and(sign, one);
+          val = vsx_i64x2_add(val, ones); // 2's complement
+          vsx_v128_store(p, val);
+      }
+    }
+  }
+}
diff --git a/src/core/codestream/ojph_tile.cpp b/src/core/codestream/ojph_tile.cpp
index 9536c608..b866fd4c 100644
--- a/src/core/codestream/ojph_tile.cpp
+++ b/src/core/codestream/ojph_tile.cpp
@@ -594,7 +594,7 @@ namespace ojph {
           OJPH_ERROR(0x00030081, "Error writing to file");
 
         //write start of data
-        ui16 t = swap_bytes_if_le(JP2K_MARKER::SOD);
+        ui16 t = swap_bytes_if_le((ui16)JP2K_MARKER::SOD);
         if (!file->write(&t, 2))
           OJPH_ERROR(0x00030082, "Error writing to file");
       }
@@ -622,7 +622,7 @@ namespace ojph {
               OJPH_ERROR(0x00030083, "Error writing to file");
 
             //write start of data
-            ui16 t = swap_bytes_if_le(JP2K_MARKER::SOD);
+            ui16 t = swap_bytes_if_le((ui16)JP2K_MARKER::SOD);
             if (!file->write(&t, 2))
               OJPH_ERROR(0x00030084, "Error writing to file");
 
@@ -642,7 +642,7 @@ namespace ojph {
                                (ui8)(c + r * num_comps), (ui8)num_tileparts))
                   OJPH_ERROR(0x00030085, "Error writing to file");
                 //write start of data
-                ui16 t = swap_bytes_if_le(JP2K_MARKER::SOD);
+                ui16 t = swap_bytes_if_le((ui16)JP2K_MARKER::SOD);
                 if (!file->write(&t, 2))
                   OJPH_ERROR(0x00030086, "Error writing to file");
                 comps[c].write_precincts(r, file);
@@ -663,7 +663,7 @@ namespace ojph {
               OJPH_ERROR(0x00030087, "Error writing to file");
 
             //write start of data
-            ui16 t = swap_bytes_if_le(JP2K_MARKER::SOD);
+            ui16 t = swap_bytes_if_le((ui16)JP2K_MARKER::SOD);
             if (!file->write(&t, 2))
               OJPH_ERROR(0x00030088, "Error writing to file");
           }
@@ -738,7 +738,7 @@ namespace ojph {
               OJPH_ERROR(0x0003008A, "Error writing to file");
 
             //write start of data
-            ui16 t = swap_bytes_if_le(JP2K_MARKER::SOD);
+            ui16 t = swap_bytes_if_le((ui16)JP2K_MARKER::SOD);
             if (!file->write(&t, 2))
               OJPH_ERROR(0x0003008B, "Error writing to file");
           }
diff --git a/src/core/coding/ojph_block_decoder.h b/src/core/coding/ojph_block_decoder.h
index a1970174..9b7bb19e 100644
--- a/src/core/coding/ojph_block_decoder.h
+++ b/src/core/coding/ojph_block_decoder.h
@@ -77,6 +77,12 @@ namespace ojph {
         ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2,
         ui32 width, ui32 height, ui32 stride, bool stripe_causal);
 
+    // POWER VSX-accelerated decoder
+    bool
+      ojph_decode_codeblock_vsx(ui8* coded_data, ui32* decoded_data,
+        ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2,
+        ui32 width, ui32 height, ui32 stride, bool stripe_causal);
+
   }
 }
 
diff --git a/src/core/coding/ojph_block_decoder_vsx.cpp b/src/core/coding/ojph_block_decoder_vsx.cpp
new file mode 100644
index 00000000..4ddcde08
--- /dev/null
+++ b/src/core/coding/ojph_block_decoder_vsx.cpp
@@ -0,0 +1,2226 @@
+//***************************************************************************/
+// This software is released under the 2-Clause BSD license, included
+// below.
+//
+// Copyright (c) 2022, Aous Naman
+// Copyright (c) 2022, Kakadu Software Pty Ltd, Australia
+// Copyright (c) 2022, The University of New South Wales, Australia
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************/
+// This file is part of the OpenJPH software implementation.
+// File: ojph_block_decoder_vsx.cpp
+// Author: Aous Naman
+// Date: 13 May 2022
+//***************************************************************************/
+
+//***************************************************************************/
+/** @file ojph_block_decoder_vsx.cpp
+ *  @brief implements a faster HTJ2K block decoder using POWER VSX simd
+ */
+
+#include <string>
+#include <iostream>
+
+#include <cassert>
+#include <cstring>
+#include "ojph_block_common.h"
+#include "ojph_block_decoder.h"
+#include "ojph_arch.h"
+#include "ojph_message.h"
+
+#include "ojph_simd_vsx.h"
+
+namespace ojph {
+  namespace local {
+
+    //************************************************************************/
+    /** @brief Macros that help with typing and space
+     */
+    #define OJPH_REPEAT2(a) a,a
+    #define OJPH_REPEAT4(a) a,a,a,a
+    #define OJPH_REPEAT8(a) a,a,a,a,a,a,a,a
+    #define OJPH_REPEAT16(a) a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a
+
+    //************************************************************************/
+    /** @brief MEL state structure for reading and decoding the MEL bitstream
+     *
+     *  A number of events is decoded from the MEL bitstream ahead of time
+     *  and stored in run/num_runs.
+     *  Each run represents the number of zero events before a one event.
+     */
+    struct dec_mel_st {
+      dec_mel_st() : data(NULL), tmp(0), bits(0), size(0), unstuff(false),
+        k(0), num_runs(0), runs(0)
+      {}
+      // data decoding machinery
+      ui8* data;    //!<the address of data (or bitstream)
+      ui64 tmp;     //!<temporary buffer for read data
+      int bits;     //!<number of bits stored in tmp
+      int size;     //!<number of bytes in MEL code
+      bool unstuff; //!<true if the next bit needs to be unstuffed
+      int k;        //!<state of MEL decoder
+
+      // queue of decoded runs
+      int num_runs; //!<number of decoded runs left in runs (maximum 8)
+      ui64 runs;    //!<runs of decoded MEL codewords (7 bits/run)
+    };
+
+    //************************************************************************/
+    /** @brief Reads and unstuffs the MEL bitstream
+     *
+     *  This design needs more bytes in the codeblock buffer than the length
+     *  of the cleanup pass by up to 2 bytes.
+     *
+     *  Unstuffing removes the MSB of the byte following a byte whose
+     *  value is 0xFF; this prevents sequences larger than 0xFF7F in value
+     *  from appearing the bitstream.
+     *
+     *  @param [in]  melp is a pointer to dec_mel_st structure
+     */
+    static inline
+    void mel_read(dec_mel_st *melp)
+    {
+      if (melp->bits > 32)  //there are enough bits in the tmp variable
+        return;             // return without reading new data
+
+      ui32 val = 0xFFFFFFFF;       // feed in 0xFF if buffer is exhausted
+      if (melp->size > 4) {        // if there is data in the MEL segment
+        memcpy(&val, melp->data, sizeof(val)); // read 32 bits from MEL data
+        melp->data += 4;           // advance pointer
+        melp->size -= 4;           // reduce counter
+      }
+      else if (melp->size > 0)
+      { // 4 or less
+        int i = 0;
+        while (melp->size > 1) {
+          ui32 v = *melp->data++;    // read one byte at a time
+          ui32 m = ~(0xFFu << i);    // mask of location
+          val = (val & m) | (v << i);// put one byte in its correct location
+          --melp->size;
+          i += 8;
+        }
+        // size equal to 1
+        ui32 v = *melp->data++;    // the one before the last is different
+        v |= 0xF;                  // MEL and VLC segments can overlap
+        ui32 m = ~(0xFFu << i);
+        val = (val & m) | (v << i);
+        --melp->size;
+      }
+
+      // next we unstuff them before adding them to the buffer
+      int bits = 32 - melp->unstuff; // number of bits in val, subtract 1 if
+                                     // the previously read byte requires
+                                     // unstuffing
+
+      // data is unstuffed and accumulated in t
+      // bits has the number of bits in t
+      ui32 t = val & 0xFF;
+      bool unstuff = ((val & 0xFF) == 0xFF); // true if we need unstuffing
+      bits -= unstuff; // there is one less bit in t if unstuffing is needed
+      t = t << (8 - unstuff); // move up to make room for the next byte
+
+      //this is a repeat of the above
+      t |= (val>>8) & 0xFF;
+      unstuff = (((val >> 8) & 0xFF) == 0xFF);
+      bits -= unstuff;
+      t = t << (8 - unstuff);
+
+      t |= (val>>16) & 0xFF;
+      unstuff = (((val >> 16) & 0xFF) == 0xFF);
+      bits -= unstuff;
+      t = t << (8 - unstuff);
+
+      t |= (val>>24) & 0xFF;
+      melp->unstuff = (((val >> 24) & 0xFF) == 0xFF);
+
+      // move t to tmp, and push the result all the way up, so we read from
+      // the MSB
+      melp->tmp |= ((ui64)t) << (64 - bits - melp->bits);
+      melp->bits += bits; //increment the number of bits in tmp
+    }
+
+    //************************************************************************/
+    /** @brief Decodes unstuffed MEL segment bits stored in tmp to runs
+     *
+     *  Runs are stored in "runs" and the number of runs in "num_runs".
+     *  Each run represents a number of zero events that may or may not
+     *  terminate in a 1 event.
+     *  Each run is stored in 7 bits.  The LSB is 1 if the run terminates in
+     *  a 1 event, 0 otherwise.  The next 6 bits, for the case terminating
+     *  with 1, contain the number of consecutive 0 zero events * 2; for the
+     *  case terminating with 0, they store (number of consecutive 0 zero
+     *  events - 1) * 2.
+     *  A total of 6 bits (made up of 1 + 5) should have been enough.
+     *
+     *  @param [in]  melp is a pointer to dec_mel_st structure
+     */
+    static inline
+    void mel_decode(dec_mel_st *melp)
+    {
+      static const int mel_exp[13] = { //MEL exponents
+        0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5
+      };
+
+      if (melp->bits < 6) // if there are less than 6 bits in tmp
+        mel_read(melp);   // then read from the MEL bitstream
+                          // 6 bits is the largest decodable MEL cwd
+
+      //repeat so long that there is enough decodable bits in tmp,
+      // and the runs store is not full (num_runs < 8)
+      while (melp->bits >= 6 && melp->num_runs < 8)
+      {
+        int eval = mel_exp[melp->k]; // number of bits associated with state
+        int run = 0;
+        if (melp->tmp & (1ull<<63)) //The next bit to decode (stored in MSB)
+        { //one is found
+          run = 1 << eval;
+          run--; // consecutive runs of 0 events - 1
+          melp->k = melp->k + 1 < 12 ? melp->k + 1 : 12;//increment, max is 12
+          melp->tmp <<= 1; // consume one bit from tmp
+          melp->bits -= 1;
+          run = run << 1; // a stretch of zeros not terminating in one
+        }
+        else
+        { //0 is found
+          run = (int)(melp->tmp >> (63 - eval)) & ((1 << eval) - 1);
+          melp->k = melp->k - 1 > 0 ? melp->k - 1 : 0; //decrement, min is 0
+          melp->tmp <<= eval + 1; //consume eval + 1 bits (max is 6)
+          melp->bits -= eval + 1;
+          run = (run << 1) + 1; // a stretch of zeros terminating with one
+        }
+        eval = melp->num_runs * 7;           // 7 bits per run
+        melp->runs &= ~((ui64)0x3F << eval); // 6 bits are sufficient
+        melp->runs |= ((ui64)run) << eval;   // store the value in runs
+        melp->num_runs++;                    // increment count
+      }
+    }
+
+    //************************************************************************/
+    /** @brief Initiates a dec_mel_st structure for MEL decoding and reads
+     *         some bytes in order to get the read address to a multiple
+     *         of 4
+     *
+     *  @param [in]  melp is a pointer to dec_mel_st structure
+     *  @param [in]  bbuf is a pointer to byte buffer
+     *  @param [in]  lcup is the length of MagSgn+MEL+VLC segments
+     *  @param [in]  scup is the length of MEL+VLC segments
+     */
+    static inline
+    void mel_init(dec_mel_st *melp, ui8* bbuf, int lcup, int scup)
+    {
+      melp->data = bbuf + lcup - scup; // move the pointer to the start of MEL
+      melp->bits = 0;                  // 0 bits in tmp
+      melp->tmp = 0;                   //
+      melp->unstuff = false;           // no unstuffing
+      melp->size = scup - 1;           // size is the length of MEL+VLC-1
+      melp->k = 0;                     // 0 for state
+      melp->num_runs = 0;              // num_runs is 0
+      melp->runs = 0;                  //
+
+      //This code is borrowed; original is for a different architecture
+      //These few lines take care of the case where data is not at a multiple
+      // of 4 boundary.  It reads 1,2,3 up to 4 bytes from the MEL segment
+      int num = 4 - (int)(intptr_t(melp->data) & 0x3);
+      for (int i = 0; i < num; ++i) { // this code is similar to mel_read
+        assert(melp->unstuff == false || melp->data[0] <= 0x8F);
+        ui64 d = (melp->size > 0) ? *melp->data : 0xFF;//if buffer is consumed
+                                                       //set data to 0xFF
+        if (melp->size == 1) d |= 0xF; //if this is MEL+VLC-1, set LSBs to 0xF
+                                       // see the standard
+        melp->data += melp->size-- > 0; //increment if the end is not reached
+        int d_bits = 8 - melp->unstuff; //if unstuffing is needed, reduce by 1
+        melp->tmp = (melp->tmp << d_bits) | d; //store bits in tmp
+        melp->bits += d_bits;  //increment tmp by number of bits
+        melp->unstuff = ((d & 0xFF) == 0xFF); //true of next byte needs
+                                              //unstuffing
+      }
+      melp->tmp <<= (64 - melp->bits); //push all the way up so the first bit
+                                       // is the MSB
+    }
+
+    //************************************************************************/
+    /** @brief Retrieves one run from dec_mel_st; if there are no runs stored
+     *         MEL segment is decoded
+     *
+     * @param [in]  melp is a pointer to dec_mel_st structure
+     */
+    static inline
+    int mel_get_run(dec_mel_st *melp)
+    {
+      if (melp->num_runs == 0)  //if no runs, decode more bit from MEL segment
+        mel_decode(melp);
+
+      int t = melp->runs & 0x7F; //retrieve one run
+      melp->runs >>= 7;  // remove the retrieved run
+      melp->num_runs--;
+      return t; // return run
+    }
+
+    //************************************************************************/
+    /** @brief A structure for reading and unstuffing a segment that grows
+     *         backward, such as VLC and MRP
+     */
+    struct rev_struct {
+      rev_struct() : data(NULL), tmp(0), bits(0), size(0), unstuff(false)
+      {}
+      //storage
+      ui8* data;     //!<pointer to where to read data
+      ui64 tmp;	     //!<temporary buffer of read data
+      ui32 bits;     //!<number of bits stored in tmp
+      int size;      //!<number of bytes left
+      bool unstuff;  //!<true if the last byte is more than 0x8F
+                     //!<then the current byte is unstuffed if it is 0x7F
+    };
+
+    //************************************************************************/
+    /** @brief Read and unstuff data from a backwardly-growing segment
+     *
+     *  This reader can read up to 8 bytes from before the VLC segment.
+     *  Care must be taken not read from unreadable memory, causing a
+     *  segmentation fault.
+     *
+     *  Note that there is another subroutine rev_read_mrp that is slightly
+     *  different.  The other one fills zeros when the buffer is exhausted.
+     *  This one basically does not care if the bytes are consumed, because
+     *  any extra data should not be used in the actual decoding.
+     *
+     *  Unstuffing is needed to prevent sequences more than 0xFF8F from
+     *  appearing in the bits stream; since we are reading backward, we keep
+     *  watch when a value larger than 0x8F appears in the bitstream.
+     *  If the byte following this is 0x7F, we unstuff this byte (ignore the
+     *  MSB of that byte, which should be 0).
+     *
+     *  @param [in]  vlcp is a pointer to rev_struct structure
+     */
+    static inline
+    void rev_read(rev_struct *vlcp)
+    {
+      //process 4 bytes at a time
+      if (vlcp->bits > 32)  // if there are more than 32 bits in tmp, then
+        return;             // reading 32 bits can overflow vlcp->tmp
+      ui32 val = 0;
+      //the next line (the if statement) needs to be tested first
+      if (vlcp->size > 3)  // if there are more than 3 bytes left in VLC
+      {
+        // (vlcp->data - 3) move pointer back to read 32 bits at once
+        memcpy(&val, vlcp->data - 3, sizeof(val)); // then read 32 bits
+        vlcp->data -= 4;          // move data pointer back by 4
+        vlcp->size -= 4;          // reduce available byte by 4
+      }
+      else if (vlcp->size > 0)
+      { // 4 or less
+        int i = 24;
+        while (vlcp->size > 0) {
+          ui32 v = *vlcp->data--; // read one byte at a time
+          val |= (v << i);        // put byte in its correct location
+          --vlcp->size;
+          i -= 8;
+        }
+      }
+
+      //accumulate in tmp, number of bits in tmp are stored in bits
+      ui32 tmp = val >> 24;  //start with the MSB byte
+      ui32 bits;
+
+      // test unstuff (previous byte is >0x8F), and this byte is 0x7F
+      bits = 8 - ((vlcp->unstuff && (((val >> 24) & 0x7F) == 0x7F)) ? 1 : 0);
+      bool unstuff = (val >> 24) > 0x8F; //this is for the next byte
+
+      tmp |= ((val >> 16) & 0xFF) << bits; //process the next byte
+      bits += 8 - ((unstuff && (((val >> 16) & 0x7F) == 0x7F)) ? 1 : 0);
+      unstuff = ((val >> 16) & 0xFF) > 0x8F;
+
+      tmp |= ((val >> 8) & 0xFF) << bits;
+      bits += 8 - ((unstuff && (((val >> 8) & 0x7F) == 0x7F)) ? 1 : 0);
+      unstuff = ((val >> 8) & 0xFF) > 0x8F;
+
+      tmp |= (val & 0xFF) << bits;
+      bits += 8 - ((unstuff && ((val & 0x7F) == 0x7F)) ? 1 : 0);
+      unstuff = (val & 0xFF) > 0x8F;
+
+      // now move the read and unstuffed bits into vlcp->tmp
+      vlcp->tmp |= (ui64)tmp << vlcp->bits;
+      vlcp->bits += bits;
+      vlcp->unstuff = unstuff; // this for the next read
+    }
+
+    //************************************************************************/
+    /** @brief Initiates the rev_struct structure and reads a few bytes to
+     *         move the read address to multiple of 4
+     *
+     *  There is another similar rev_init_mrp subroutine.  The difference is
+     *  that this one, rev_init, discards the first 12 bits (they have the
+     *  sum of the lengths of VLC and MEL segments), and first unstuff depends
+     *  on first 4 bits.
+     *
+     *  @param [in]  vlcp is a pointer to rev_struct structure
+     *  @param [in]  data is a pointer to byte at the start of the cleanup pass
+     *  @param [in]  lcup is the length of MagSgn+MEL+VLC segments
+     *  @param [in]  scup is the length of MEL+VLC segments
+     */
+    static inline
+    void rev_init(rev_struct *vlcp, ui8* data, int lcup, int scup)
+    {
+      //first byte has only the upper 4 bits
+      vlcp->data = data + lcup - 2;
+
+      //size can not be larger than this, in fact it should be smaller
+      vlcp->size = scup - 2;
+
+      ui32 d = *vlcp->data--; // read one byte (this is a half byte)
+      vlcp->tmp = d >> 4;    // both initialize and set
+      vlcp->bits = 4 - ((vlcp->tmp & 7) == 7); //check standard
+      vlcp->unstuff = (d | 0xF) > 0x8F; //this is useful for the next byte
+
+      //This code is designed for an architecture that read address should
+      // align to the read size (address multiple of 4 if read size is 4)
+      //These few lines take care of the case where data is not at a multiple
+      // of 4 boundary. It reads 1,2,3 up to 4 bytes from the VLC bitstream.
+      // To read 32 bits, read from (vlcp->data - 3)
+      int num = 1 + (int)(intptr_t(vlcp->data) & 0x3);
+      int tnum = num < vlcp->size ? num : vlcp->size;
+      for (int i = 0; i < tnum; ++i) {
+        ui64 d;
+        d = *vlcp->data--;  // read one byte and move read pointer
+        //check if the last byte was >0x8F (unstuff == true) and this is 0x7F
+        ui32 d_bits = 8 - ((vlcp->unstuff && ((d & 0x7F) == 0x7F)) ? 1 : 0);
+        vlcp->tmp |= d << vlcp->bits; // move data to vlcp->tmp
+        vlcp->bits += d_bits;
+        vlcp->unstuff = d > 0x8F; // for next byte
+      }
+      vlcp->size -= tnum;
+      rev_read(vlcp);  // read another 32 buts
+    }
+
+    //************************************************************************/
+    /** @brief Retrieves 32 bits from the head of a rev_struct structure
+     *
+     *  By the end of this call, vlcp->tmp must have no less than 33 bits
+     *
+     *  @param [in]  vlcp is a pointer to rev_struct structure
+     */
+    static inline
+    ui32 rev_fetch(rev_struct *vlcp)
+    {
+      if (vlcp->bits < 32)  // if there are less then 32 bits, read more
+      {
+        rev_read(vlcp);     // read 32 bits, but unstuffing might reduce this
+        if (vlcp->bits < 32)// if there is still space in vlcp->tmp for 32 bits
+          rev_read(vlcp);   // read another 32
+      }
+      return (ui32)vlcp->tmp; // return the head (bottom-most) of vlcp->tmp
+    }
+
+    //************************************************************************/
+    /** @brief Consumes num_bits from a rev_struct structure
+     *
+     *  @param [in]  vlcp is a pointer to rev_struct structure
+     *  @param [in]  num_bits is the number of bits to be removed
+     */
+    static inline
+    ui32 rev_advance(rev_struct *vlcp, ui32 num_bits)
+    {
+      assert(num_bits <= vlcp->bits); // vlcp->tmp must have more than num_bits
+      vlcp->tmp >>= num_bits;         // remove bits
+      vlcp->bits -= num_bits;         // decrement the number of bits
+      return (ui32)vlcp->tmp;
+    }
+
+    //************************************************************************/
+    /** @brief Reads and unstuffs from rev_struct
+     *
+     *  This is different than rev_read in that this fills in zeros when the
+     *  the available data is consumed.  The other does not care about the
+     *  values when all data is consumed.
+     *
+     *  See rev_read for more information about unstuffing
+     *
+     *  @param [in]  mrp is a pointer to rev_struct structure
+     */
+    static inline
+    void rev_read_mrp(rev_struct *mrp)
+    {
+      //process 4 bytes at a time
+      if (mrp->bits > 32)
+        return;
+      ui32 val = 0;
+      if (mrp->size > 3) // If there are 3 byte or more
+      { // (mrp->data - 3) move pointer back to read 32 bits at once
+        memcpy(&val, mrp->data - 3, sizeof(val)); // read 32 bits
+        mrp->data -= 4;                // move back pointer
+        mrp->size -= 4;                // reduce count
+      }
+      else if (mrp->size > 0)
+      {
+        int i = 24;
+        while (mrp->size > 0) {
+          ui32 v = *mrp->data--; // read one byte at a time
+          val |= (v << i);       // put byte in its correct location
+          --mrp->size;
+          i -= 8;
+        }
+      }
+
+      //accumulate in tmp, and keep count in bits
+      ui32 bits, tmp = val >> 24;
+
+      //test if the last byte > 0x8F (unstuff must be true) and this is 0x7F
+      bits = 8 - ((mrp->unstuff && (((val >> 24) & 0x7F) == 0x7F)) ? 1 : 0);
+      bool unstuff = (val >> 24) > 0x8F;
+
+      //process the next byte
+      tmp |= ((val >> 16) & 0xFF) << bits;
+      bits += 8 - ((unstuff && (((val >> 16) & 0x7F) == 0x7F)) ? 1 : 0);
+      unstuff = ((val >> 16) & 0xFF) > 0x8F;
+
+      tmp |= ((val >> 8) & 0xFF) << bits;
+      bits += 8 - ((unstuff && (((val >> 8) & 0x7F) == 0x7F)) ? 1 : 0);
+      unstuff = ((val >> 8) & 0xFF) > 0x8F;
+
+      tmp |= (val & 0xFF) << bits;
+      bits += 8 - ((unstuff && ((val & 0x7F) == 0x7F)) ? 1 : 0);
+      unstuff = (val & 0xFF) > 0x8F;
+
+      mrp->tmp |= (ui64)tmp << mrp->bits; // move data to mrp pointer
+      mrp->bits += bits;
+      mrp->unstuff = unstuff;             // next byte
+    }
+
+    //************************************************************************/
+    /** @brief Initialized rev_struct structure for MRP segment, and reads
+     *         a number of bytes such that the next 32 bits read are from
+     *         an address that is a multiple of 4. Note this is designed for
+     *         an architecture that read size must be compatible with the
+     *         alignment of the read address
+     *
+     *  There is another similar subroutine rev_init.  This subroutine does
+     *  NOT skip the first 12 bits, and starts with unstuff set to true.
+     *
+     *  @param [in]  mrp is a pointer to rev_struct structure
+     *  @param [in]  data is a pointer to byte at the start of the cleanup pass
+     *  @param [in]  lcup is the length of MagSgn+MEL+VLC segments
+     *  @param [in]  len2 is the length of SPP+MRP segments
+     */
+    static inline
+    void rev_init_mrp(rev_struct *mrp, ui8* data, int lcup, int len2)
+    {
+      mrp->data = data + lcup + len2 - 1;
+      mrp->size = len2;
+      mrp->unstuff = true;
+      mrp->bits = 0;
+      mrp->tmp = 0;
+
+      //This code is designed for an architecture that read address should
+      // align to the read size (address multiple of 4 if read size is 4)
+      //These few lines take care of the case where data is not at a multiple
+      // of 4 boundary.  It reads 1,2,3 up to 4 bytes from the MRP stream
+      int num = 1 + (int)(intptr_t(mrp->data) & 0x3);
+      for (int i = 0; i < num; ++i) {
+        ui64 d;
+        //read a byte, 0 if no more data
+        d = (mrp->size-- > 0) ? *mrp->data-- : 0;
+        //check if unstuffing is needed
+        ui32 d_bits = 8 - ((mrp->unstuff && ((d & 0x7F) == 0x7F)) ? 1 : 0);
+        mrp->tmp |= d << mrp->bits; // move data to vlcp->tmp
+        mrp->bits += d_bits;
+        mrp->unstuff = d > 0x8F; // for next byte
+      }
+      rev_read_mrp(mrp);
+    }
+
+    //************************************************************************/
+    /** @brief Retrieves 32 bits from the head of a rev_struct structure
+     *
+     *  By the end of this call, mrp->tmp must have no less than 33 bits
+     *
+     *  @param [in]  mrp is a pointer to rev_struct structure
+     */
+    static inline
+    ui32 rev_fetch_mrp(rev_struct *mrp)
+    {
+      if (mrp->bits < 32) // if there are less than 32 bits in mrp->tmp
+      {
+        rev_read_mrp(mrp);    // read 30-32 bits from mrp
+        if (mrp->bits < 32)   // if there is a space of 32 bits
+          rev_read_mrp(mrp);  // read more
+      }
+      return (ui32)mrp->tmp;  // return the head of mrp->tmp
+    }
+
+    //************************************************************************/
+    /** @brief Consumes num_bits from a rev_struct structure
+     *
+     *  @param [in]  mrp is a pointer to rev_struct structure
+     *  @param [in]  num_bits is the number of bits to be removed
+     */
+    inline ui32 rev_advance_mrp(rev_struct *mrp, ui32 num_bits)
+    {
+      assert(num_bits <= mrp->bits); // we must not consume more than mrp->bits
+      mrp->tmp >>= num_bits;  // discard the lowest num_bits bits
+      mrp->bits -= num_bits;
+      return (ui32)mrp->tmp;  // return data after consumption
+    }
+
+    //************************************************************************/
+    /** @brief State structure for reading and unstuffing of forward-growing
+     *         bitstreams; these are: MagSgn and SPP bitstreams
+     */
+    struct frwd_struct {
+      const ui8* data;  //!<pointer to bitstream
+      ui8 tmp[48];      //!<temporary buffer of read data + 16 extra
+      ui32 bits;        //!<number of bits stored in tmp
+      ui32 unstuff;     //!<1 if a bit needs to be unstuffed from next byte
+      int size;         //!<size of data
+    };
+
+    //************************************************************************/
+    /** @brief Read and unstuffs 16 bytes from forward-growing bitstream
+     *
+     *  A template is used to accommodate a different requirement for
+     *  MagSgn and SPP bitstreams; in particular, when MagSgn bitstream is
+     *  consumed, 0xFF's are fed, while when SPP is exhausted 0's are fed in.
+     *  X controls this value.
+     *
+     *  Unstuffing prevent sequences that are more than 0xFF7F from appearing
+     *  in the conpressed sequence.  So whenever a value of 0xFF is coded, the
+     *  MSB of the next byte is set 0 and must be ignored during decoding.
+     *
+     *  Reading can go beyond the end of buffer by up to 16 bytes.
+     *
+     *  @tparam       X is the value fed in when the bitstream is exhausted
+     *  @param  [in]  msp is a pointer to frwd_struct structure
+     *
+     */
+    template<int X>
+    static inline
+    void frwd_read(frwd_struct *msp)
+    {
+      assert(msp->bits <= 128);
+
+      v128_t offset, val, validity, all_xff;
+      val = vsx_v128_load(msp->data);
+      int bytes = msp->size >= 16 ? 16 : msp->size;
+      validity = vsx_i8x16_splat((char)bytes);
+      msp->data += bytes;
+      msp->size -= bytes;
+      ui32 bits = 128;
+      offset = vsx_i64x2_const(0x0706050403020100,0x0F0E0D0C0B0A0908);
+      validity = vsx_i8x16_gt(validity, offset);
+      all_xff = vsx_i8x16_const(OJPH_REPEAT16(-1));
+      if (X == 0xFF) // the compiler should remove this if statement
+      {
+        v128_t t = vsx_v128_xor(validity, all_xff); // complement
+        val = vsx_v128_or(t, val); // fill with 0xFF
+      }
+      else if (X == 0)
+        val = vsx_v128_and(validity, val); // fill with zeros
+      else
+        assert(0);
+
+      v128_t ff_bytes;
+      ff_bytes = vsx_i8x16_eq(val, all_xff);
+      ff_bytes = vsx_v128_and(ff_bytes, validity);
+      ui32 flags = vsx_i8x16_bitmask(ff_bytes);
+      flags <<= 1; // unstuff following byte
+      ui32 next_unstuff = flags >> 16;
+      flags |= msp->unstuff;
+      flags &= 0xFFFF;
+      while (flags)
+      { // bit unstuffing occurs on average once every 256 bytes
+        // therefore it is not an issue if it is a bit slow
+        // here we process 16 bytes
+        --bits; // consuming one stuffing bit
+
+        ui32 loc = 31 - count_leading_zeros(flags);
+        flags ^= 1 << loc;
+
+        v128_t m, t, c;
+        t = vsx_i8x16_splat((char)loc);
+        m = vsx_i8x16_gt(offset, t);
+
+        t = vsx_v128_and(m, val);    // keep bits at locations larger than loc
+        c = vsx_u64x2_shr(t, 1);     // 1 bits left
+        t = vsx_i64x2_shuffle(t, vsx_i64x2_const(0, 0), 1, 2);
+        t = vsx_i64x2_shl(t, 63);    // keep the MSB only
+        t = vsx_v128_or(t, c);       // combine the above 3 steps
+
+        val = vsx_v128_or(t, vsx_v128_andnot(val, m));
+      }
+
+      // combine with earlier data
+      assert(msp->bits >= 0 && msp->bits <= 128);
+      int cur_bytes = msp->bits >> 3;
+      ui32 cur_bits = msp->bits & 7;
+      v128_t b1, b2;
+      b1 = vsx_i64x2_shl(val, cur_bits);
+      //next shift 8 bytes right
+      b2 = vsx_i64x2_shuffle(vsx_i64x2_const(0, 0), val, 1, 2);
+      b2 = vsx_u64x2_shr(b2, 64u - cur_bits);
+      b2 = (cur_bits > 0) ? b2 : vsx_i64x2_const(0, 0);
+      b1 = vsx_v128_or(b1, b2);
+      b2 = vsx_v128_load(msp->tmp + cur_bytes);
+      b2 = vsx_v128_or(b1, b2);
+      vsx_v128_store(msp->tmp + cur_bytes, b2);
+
+      ui32 consumed_bits = bits < 128u - cur_bits ? bits : 128u - cur_bits;
+      cur_bytes = (msp->bits + consumed_bits + 7) >> 3; // round up
+      int upper = vsx_u16x8_extract_lane(val, 7);
+      upper >>= consumed_bits + 16 - 128;
+      msp->tmp[cur_bytes] = (ui8)upper; // copy byte
+
+      msp->bits += bits;
+      msp->unstuff = next_unstuff;   // next unstuff
+      assert(msp->unstuff == 0 || msp->unstuff == 1);
+    }
+
+    //************************************************************************/
+    /** @brief Initialize frwd_struct struct and reads some bytes
+     *
+     *  @tparam      X is the value fed in when the bitstream is exhausted.
+     *               See frwd_read regarding the template
+     *  @param [in]  msp is a pointer to frwd_struct
+     *  @param [in]  data is a pointer to the start of data
+     *  @param [in]  size is the number of byte in the bitstream
+     */
+    template<int X>
+    static inline
+    void frwd_init(frwd_struct *msp, const ui8* data, int size)
+    {
+      msp->data = data;
+      vsx_v128_store(msp->tmp, vsx_i64x2_const(0, 0));
+      vsx_v128_store(msp->tmp + 16, vsx_i64x2_const(0, 0));
+      vsx_v128_store(msp->tmp + 32, vsx_i64x2_const(0, 0));
+
+      msp->bits = 0;
+      msp->unstuff = 0;
+      msp->size = size;
+
+      frwd_read<X>(msp); // read 128 bits more
+    }
+
+    //************************************************************************/
+    /** @brief Consume num_bits bits from the bitstream of frwd_struct
+     *
+     *  @param [in]  msp is a pointer to frwd_struct
+     *  @param [in]  num_bits is the number of bit to consume
+     */
+    static inline
+    void frwd_advance(frwd_struct *msp, ui32 num_bits)
+    {
+      assert(num_bits > 0 && num_bits <= msp->bits && num_bits < 128);
+      msp->bits -= num_bits;
+
+      v128_t *p = (v128_t*)(msp->tmp + ((num_bits >> 3) & 0x18));
+      num_bits &= 63;
+
+      v128_t v0, v1, c0, c1, t;
+      v0 = vsx_v128_load(p);
+      v1 = vsx_v128_load(p + 1);
+
+      // shift right by num_bits
+      c0 = vsx_u64x2_shr(v0, num_bits);
+      t = vsx_i64x2_shuffle(v0, vsx_i64x2_const(0, 0), 1, 2);
+      t = vsx_i64x2_shl(t, 64 - num_bits);
+      t = (num_bits > 0) ? t : vsx_i64x2_const(0, 0);
+      c0 = vsx_v128_or(c0, t);
+      t = vsx_i64x2_shuffle(vsx_i64x2_const(0, 0), v1, 1, 2);
+      t = vsx_i64x2_shl(t, 64 - num_bits);
+      t = (num_bits > 0) ? t : vsx_i64x2_const(0, 0);
+      c0 = vsx_v128_or(c0, t);
+
+      vsx_v128_store(msp->tmp, c0);
+
+      c1 = vsx_u64x2_shr(v1, num_bits);
+      t = vsx_i64x2_shuffle(v1, vsx_i64x2_const(0, 0), 1, 2);
+      t = vsx_i64x2_shl(t, 64 - num_bits);
+      t = (num_bits > 0) ? t : vsx_i64x2_const(0, 0);
+      c1 = vsx_v128_or(c1, t);
+
+      vsx_v128_store(msp->tmp + 16, c1);
+    }
+
+    //************************************************************************/
+    /** @brief Fetches 32 bits from the frwd_struct bitstream
+     *
+     *  @tparam      X is the value fed in when the bitstream is exhausted.
+     *               See frwd_read regarding the template
+     *  @param [in]  msp is a pointer to frwd_struct
+     */
+    template<int X>
+    static inline
+    v128_t frwd_fetch(frwd_struct *msp)
+    {
+      if (msp->bits <= 128)
+      {
+        frwd_read<X>(msp);
+        if (msp->bits <= 128) //need to test
+          frwd_read<X>(msp);
+      }
+      v128_t t = vsx_v128_load(msp->tmp);
+      return t;
+    }
+
+    //************************************************************************/
+    /** @brief Destuffs a bitstream into a contiguous buffer, upfront
+     *
+     *  Removes bit stuffing once for the whole stream, so that fetching
+     *  bits at a given position later needs no serial state; this keeps
+     *  the per-quad bit-consumption chain in general-purpose registers
+     *  (pos += bits) instead of shifting a vector-resident window, which
+     *  on POWER costs vector stores/reloads through the stack.
+     *
+     *  @tparam      X is the value fed in when the bitstream is exhausted
+     *  @param [in]  src is a pointer to the start of the bitstream
+     *  @param [in]  size is the number of bytes in the bitstream
+     *  @param [in]  dst is the destination buffer, of capacity cap + 72
+     *  @param [in]  cap is the destination capacity, excluding padding
+     *  @return      the clamp offset; bytes at or beyond this offset hold
+     *               no stream bits (they read as X)
+     */
+    template<int X>
+    static inline
+    ui32 destuff_frwd(const ui8* src, int size, ui8* dst, ui32 cap)
+    {
+      if (size < 0)
+        size = 0;
+      ui8* o = dst;
+      ui8* o_end = dst + cap;
+      const ui8* s = src;
+      const ui8* s_end = src + size;
+      ui64 acc = 0;       // partial output byte, low nb bits are valid
+      ui32 nb = 0;        // number of valid bits in acc; always < 8
+      bool prev_ff = false;
+
+      // fast path; 16 source bytes at a time when they contain no 0xFF
+      while (s + 16 <= s_end && o + 24 <= o_end)
+      {
+        v128_t v = vsx_v128_load(s);
+        if (vec_any_eq((vsx_v_u8)v, vec_splats((unsigned char)0xFF))
+            || prev_ff)
+        { // process these 16 bytes one at a time
+          for (int i = 0; i < 16; ++i) {
+            ui8 b = *s++;
+            acc |= (ui64)b << nb;
+            nb += prev_ff ? 7u : 8u;
+            prev_ff = (b == 0xFFu);
+            if (nb >= 8) { *o++ = (ui8)acc; acc >>= 8; nb -= 8; }
+          }
+          continue;
+        }
+        ui64 v0, v1;
+        memcpy(&v0, s, 8);
+        memcpy(&v1, s + 8, 8);
+        ui64 w0 = acc | (v0 << nb);
+        ui64 w1 = (v1 << nb) | (nb ? (v0 >> (64 - nb)) : 0);
+        memcpy(o, &w0, 8);
+        memcpy(o + 8, &w1, 8);
+        acc = nb ? (v1 >> (64 - nb)) : 0;
+        o += 16;
+        s += 16;
+      }
+      // tail; one byte at a time
+      while (s < s_end && o < o_end)
+      {
+        ui8 b = *s++;
+        acc |= (ui64)b << nb;
+        nb += prev_ff ? 7u : 8u;
+        prev_ff = (b == 0xFFu);
+        if (nb >= 8) { *o++ = (ui8)acc; acc >>= 8; nb -= 8; }
+      }
+      // fill the bits above nb with X, and pad with X bytes
+      ui32 fill = (X == 0xFF) ? (0xFFu << nb) : 0;
+      *o = (ui8)((ui32)acc | fill);
+      memset(o + 1, X, 64);
+      return (ui32)(o - dst) + 1;
+    }
+
+    //************************************************************************/
+    /** @brief Fetches 128 bits from a destuffed bitstream buffer
+     *
+     *  Returns the 128 bits starting at bit position pos of the buffer
+     *  produced by destuff_frwd.  Carries no serial state; fetches at
+     *  independent positions can execute out of order.  The byte offset
+     *  is clamped to limit so that positions past the end of the stream
+     *  read from the padding without leaving the buffer.
+     *
+     *  @param [in]  dbuf is the destuffed bitstream buffer
+     *  @param [in]  limit is the clamp offset returned by destuff_frwd
+     *  @param [in]  pos is the absolute bit position to fetch from
+     */
+    static inline
+    v128_t vsx_dfetch(const ui8* dbuf, ui32 limit, ui32 pos)
+    {
+      ui32 off = pos >> 3;
+      off = off < limit ? off : limit;
+      const ui8* p = dbuf + off;
+      v128_t v = vsx_v128_load(p);
+      v128_t w = vsx_v128_load(p + 8);
+      int k = (int)(pos & 7);
+      v128_t r = vsx_u64x2_shr(v, k);
+      // shift left by 64 - k without branching on k == 0; vector shifts
+      // are modulo 64, so the shift is split into 1 and 63 - k
+      v128_t c = vsx_i64x2_shl(vsx_i64x2_shl(w, 1), 63 - k);
+      return vsx_v128_or(r, c);
+    }
+
+    //************************************************************************/
+    /** @brief decodes one quad, using 32 bit data
+     *
+     *  @tparam N       0 for the first quad and 1 for the second quad in an
+     *                  octet
+     *  @param inf_u_q  decoded VLC code, with interleaved u values
+     *  @param U_q      U values
+     *  @param magsgn   structure for forward data buffer
+     *  @param p        bitplane at which we are decoding
+     *  @param vn       used for handling E values (stores v_n values)
+     *  @return v128_t decoded quad
+     */
+    template <int N>
+    static inline
+    v128_t decode_one_quad32(const v128_t inf_u_q, v128_t U_q,
+                              frwd_struct* magsgn, ui32 p, v128_t& vn)
+    {
+      v128_t w0;    // workers
+      v128_t insig; // lanes hold FF's if samples are insignificant
+      v128_t flags; // lanes hold e_k, e_1, and rho
+      v128_t row;   // decoded row
+
+      row = vsx_i64x2_const(0, 0);
+      w0 = vsx_i32x4_shuffle(inf_u_q, inf_u_q, N, N, N, N);
+      // we keeps e_k, e_1, and rho in w2
+      flags = vsx_v128_and(w0, vsx_i32x4_const(0x1110,0x2220,0x4440,0x8880));
+      insig = vsx_i32x4_eq(flags, vsx_i64x2_const(0, 0));
+      if (vsx_i8x16_bitmask(insig) != 0xFFFF) //are all insignificant?
+      {
+        U_q = vsx_i32x4_shuffle(U_q, U_q, N, N, N, N);
+        flags = vsx_i16x8_mul(flags, vsx_i16x8_const(8,8,4,4,2,2,1,1));
+        v128_t ms_vec = frwd_fetch<0xFF>(magsgn);
+
+        // U_q holds U_q for this quad
+        // flags has e_k, e_1, and rho such that e_k is sitting in the
+        // 0x8000, e_1 in 0x800, and rho in 0x80
+
+        // next e_k and m_n
+        v128_t m_n;
+        w0 = vsx_u32x4_shr(flags, 15); // e_k
+        m_n = vsx_i32x4_sub(U_q, w0);
+        m_n = vsx_v128_andnot(m_n, insig);
+
+        // find cumulative sums
+        // to find at which bit in ms_vec the sample starts
+        v128_t ex_sum, shfl, inc_sum = m_n; // inclusive scan
+        shfl = vsx_i32x4_shuffle(vsx_i64x2_const(0,0), inc_sum, 3, 4, 5, 6);
+        inc_sum = vsx_i32x4_add(inc_sum, shfl);
+        shfl = vsx_i64x2_shuffle(vsx_i64x2_const(0,0), inc_sum, 1, 2);
+        inc_sum = vsx_i32x4_add(inc_sum, shfl);
+        int total_mn = vsx_u16x8_extract_lane(inc_sum, 6);
+        ex_sum = vsx_i32x4_shuffle(vsx_i64x2_const(0,0), inc_sum, 3, 4, 5, 6);
+
+        // find the starting byte and starting bit
+        v128_t byte_idx = vsx_u32x4_shr(ex_sum, 3);
+        v128_t bit_idx =
+          vsx_v128_and(ex_sum, vsx_i32x4_const(OJPH_REPEAT4(7)));
+        byte_idx = vsx_i8x16_swizzle(byte_idx,
+          vsx_i32x4_const(0x00000000, 0x04040404, 0x08080808, 0x0C0C0C0C));
+        byte_idx =
+          vsx_i32x4_add(byte_idx, vsx_i32x4_const(OJPH_REPEAT4(0x03020100)));
+        v128_t d0 = vsx_i8x16_swizzle(ms_vec, byte_idx);
+        byte_idx =
+          vsx_i32x4_add(byte_idx, vsx_i32x4_const(OJPH_REPEAT4(0x01010101)));
+        v128_t d1 = vsx_i8x16_swizzle(ms_vec, byte_idx);
+
+        // shift samples values to correct location
+        bit_idx = vsx_v128_or(bit_idx, vsx_i32x4_shl(bit_idx, 16));
+        v128_t bit_shift = vsx_i8x16_swizzle(
+          vsx_i8x16_const(-1, 127, 63, 31, 15, 7, 3, 1,
+                           -1, 127, 63, 31, 15, 7, 3, 1), bit_idx);
+        bit_shift =
+          vsx_i16x8_add(bit_shift, vsx_i16x8_const(OJPH_REPEAT8(0x0101)));
+        d0 = vsx_i16x8_mul(d0, bit_shift);
+        d0 = vsx_u16x8_shr(d0, 8); // we should have 8 bits in the LSB
+        d1 = vsx_i16x8_mul(d1, bit_shift);
+        d1 =  // 8 in MSB
+          vsx_v128_and(d1, vsx_u32x4_const(OJPH_REPEAT4(0xFF00FF00)));
+        d0 = vsx_v128_or(d0, d1);
+
+        // find location of e_k and mask
+        v128_t shift;
+        v128_t ones = vsx_i32x4_const(OJPH_REPEAT4(1));
+        v128_t twos = vsx_i32x4_const(OJPH_REPEAT4(2));
+        ui32 U_q_m1 = vsx_u32x4_extract_lane(U_q, 0) - 1u;
+        w0 = vsx_i32x4_sub(twos, w0);
+        shift = vsx_i32x4_shl(w0, U_q_m1);
+        ms_vec = vsx_v128_and(d0, vsx_i32x4_sub(shift, ones));
+
+        // next e_1
+        w0 = vsx_v128_and(flags, vsx_i32x4_const(OJPH_REPEAT4(0x800)));
+        w0 = vsx_i32x4_eq(w0, vsx_i64x2_const(0, 0));
+        w0 = vsx_v128_andnot(shift, w0);  // e_1 in correct position
+        ms_vec = vsx_v128_or(ms_vec, w0); // e_1
+        w0 = vsx_i32x4_shl(ms_vec, 31);   // sign
+        ms_vec = vsx_v128_or(ms_vec, ones); // bin center
+        v128_t tvn = ms_vec;
+        ms_vec = vsx_i32x4_add(ms_vec, twos);// + 2
+        ms_vec = vsx_i32x4_shl(ms_vec, p - 1);
+        ms_vec = vsx_v128_or(ms_vec, w0); // sign
+        row = vsx_v128_andnot(ms_vec, insig); // significant only
+
+        ms_vec = vsx_v128_andnot(tvn, insig); // significant only
+        if (N == 0) // the compiler should remove one
+          tvn = vsx_i8x16_swizzle(ms_vec,
+            vsx_i32x4_const(0x07060504, 0x0F0E0D0C, -1, -1));
+        else if (N == 1)
+          tvn = vsx_i8x16_swizzle(ms_vec,
+            vsx_i32x4_const(-1, 0x07060504, 0x0F0E0D0C, -1));
+        else
+          assert(0);
+        vn = vsx_v128_or(vn, tvn);
+
+        if (total_mn)
+          frwd_advance(magsgn, (ui32)total_mn);
+      }
+      return row;
+    }
+
+   //************************************************************************/
+    /** @brief decodes twos consecutive quads (one octet), using 16 bit data
+     *
+     *  @param inf_u_q  decoded VLC code, with interleaved u values
+     *  @param U_q      U values
+     *  @param magsgn   structure for forward data buffer
+     *  @param p        bitplane at which we are decoding
+     *  @param vn       used for handling E values (stores v_n values)
+     *  @return v128_t decoded quad
+     */
+    static inline
+    v128_t decode_two_quad16(const v128_t inf_u_q, v128_t U_q,
+                              const ui8* dbuf, ui32 limit, ui32& pos,
+                              ui32 p, v128_t& vn)
+    {
+      v128_t w0;     // workers
+      v128_t insig;  // lanes hold FF's if samples are insignificant
+      v128_t flags;  // lanes hold e_k, e_1, and rho
+      v128_t row;    // decoded row
+
+      row = vsx_i64x2_const(0, 0);
+      w0 = vsx_i8x16_swizzle(inf_u_q,
+        vsx_i16x8_const(0x0100, 0x0100, 0x0100, 0x0100,
+                         0x0504, 0x0504, 0x0504, 0x0504));
+      // we keeps e_k, e_1, and rho in w2
+      flags = vsx_v128_and(w0,
+        vsx_u16x8_const(0x1110, 0x2220, 0x4440, 0x8880,
+                         0x1110, 0x2220, 0x4440, 0x8880));
+      insig = vsx_i16x8_eq(flags, vsx_i64x2_const(0, 0));
+      if (vsx_i8x16_bitmask(insig) != 0xFFFF) //are all insignificant?
+      {
+        U_q = vsx_i8x16_swizzle(U_q,
+          vsx_i16x8_const(0x0100, 0x0100, 0x0100, 0x0100,
+                           0x0504, 0x0504, 0x0504, 0x0504));
+        flags = vsx_i16x8_mul(flags, vsx_i16x8_const(8,4,2,1,8,4,2,1));
+        v128_t ms_vec = vsx_dfetch(dbuf, limit, pos);
+
+        // U_q holds U_q for this quad
+        // flags has e_k, e_1, and rho such that e_k is sitting in the
+        // 0x8000, e_1 in 0x800, and rho in 0x80
+
+        // next e_k and m_n
+        v128_t m_n;
+        w0 = vsx_u16x8_shr(flags, 15); // e_k
+        m_n = vsx_i16x8_sub(U_q, w0);
+        m_n = vsx_v128_andnot(m_n, insig);
+
+        // find cumulative sums
+        // to find at which bit in ms_vec the sample starts
+        v128_t ex_sum, shfl, inc_sum = m_n; // inclusive scan
+        shfl = vsx_i16x8_shuffle(vsx_i64x2_const(0,0),
+          inc_sum, 7, 8, 9, 10, 11, 12, 13, 14);
+        inc_sum = vsx_i16x8_add(inc_sum, shfl);
+        shfl = vsx_i32x4_shuffle(vsx_i64x2_const(0,0), inc_sum, 3, 4, 5, 6);
+        inc_sum = vsx_i16x8_add(inc_sum, shfl);
+        shfl = vsx_i64x2_shuffle(vsx_i64x2_const(0,0), inc_sum, 1, 2);
+        inc_sum = vsx_i16x8_add(inc_sum, shfl);
+        int total_mn = vsx_u16x8_extract_lane(inc_sum, 7);
+        ex_sum = vsx_i16x8_shuffle(vsx_i64x2_const(0,0),
+          inc_sum, 7, 8, 9, 10, 11, 12, 13, 14);
+
+        // find the starting byte and starting bit
+        v128_t byte_idx = vsx_u16x8_shr(ex_sum, 3);
+        v128_t bit_idx =
+          vsx_v128_and(ex_sum, vsx_i16x8_const(OJPH_REPEAT8(7)));
+        byte_idx = vsx_i8x16_swizzle(byte_idx,
+          vsx_i16x8_const(0x0000, 0x0202, 0x0404, 0x0606,
+                           0x0808, 0x0A0A, 0x0C0C, 0x0E0E));
+        byte_idx =
+          vsx_i16x8_add(byte_idx, vsx_i16x8_const(OJPH_REPEAT8(0x0100)));
+        v128_t d0 = vsx_i8x16_swizzle(ms_vec, byte_idx);
+        byte_idx =
+          vsx_i16x8_add(byte_idx, vsx_i16x8_const(OJPH_REPEAT8(0x0101)));
+        v128_t d1 = vsx_i8x16_swizzle(ms_vec, byte_idx);
+
+        // shift samples values to correct location
+        v128_t bit_shift = vsx_i8x16_swizzle(
+          vsx_i8x16_const(-1, 127, 63, 31, 15, 7, 3, 1,
+                           -1, 127, 63, 31, 15, 7, 3, 1), bit_idx);
+        bit_shift =
+          vsx_i16x8_add(bit_shift, vsx_i16x8_const(OJPH_REPEAT8(0x0101)));
+        d0 = vsx_i16x8_mul(d0, bit_shift);
+        d0 = vsx_u16x8_shr(d0, 8); // we should have 8 bits in the LSB
+        d1 = vsx_i16x8_mul(d1, bit_shift);
+        d1 = // 8 in MSB
+          vsx_v128_and(d1, vsx_i16x8_const(OJPH_REPEAT8((si16)0xFF00)));
+        d0 = vsx_v128_or(d0, d1);
+
+        // find location of e_k and mask
+        v128_t shift, t0, t1;
+        v128_t ones = vsx_i16x8_const(OJPH_REPEAT8(1));
+        v128_t twos = vsx_i16x8_const(OJPH_REPEAT8(2));
+        v128_t U_q_m1 = vsx_i32x4_sub(U_q, ones);
+        ui32 Uq0 = vsx_u16x8_extract_lane(U_q_m1, 0);
+        ui32 Uq1 = vsx_u16x8_extract_lane(U_q_m1, 4);
+        w0 = vsx_i16x8_sub(twos, w0);
+        t0 = vsx_v128_and(w0, vsx_i64x2_const(-1, 0));
+        t1 = vsx_v128_and(w0, vsx_i64x2_const(0, -1));
+        t0 = vsx_i32x4_shl(t0, Uq0);
+        t1 = vsx_i32x4_shl(t1, Uq1);
+        shift = vsx_v128_or(t0, t1);
+        ms_vec = vsx_v128_and(d0, vsx_i16x8_sub(shift, ones));
+
+        // next e_1
+        w0 = vsx_v128_and(flags, vsx_i16x8_const(OJPH_REPEAT8(0x800)));
+        w0 = vsx_i16x8_eq(w0, vsx_i64x2_const(0, 0));
+        w0 = vsx_v128_andnot(shift, w0);  // e_1 in correct position
+        ms_vec = vsx_v128_or(ms_vec, w0); // e_1
+        w0 = vsx_i16x8_shl(ms_vec, 15);   // sign
+        ms_vec = vsx_v128_or(ms_vec, ones); // bin center
+        v128_t tvn = ms_vec;
+        ms_vec = vsx_i16x8_add(ms_vec, twos);// + 2
+        ms_vec = vsx_i16x8_shl(ms_vec, p - 1);
+        ms_vec = vsx_v128_or(ms_vec, w0); // sign
+        row = vsx_v128_andnot(ms_vec, insig); // significant only
+
+        ms_vec = vsx_v128_andnot(tvn, insig); // significant only
+        w0 = vsx_i8x16_swizzle(ms_vec,
+          vsx_i16x8_const(0x0302, 0x0706, -1, -1, -1, -1, -1, -1));
+        vn = vsx_v128_or(vn, w0);
+        w0 = vsx_i8x16_swizzle(ms_vec,
+          vsx_i16x8_const(-1, 0x0B0A, 0x0F0E, -1, -1, -1, -1, -1));
+        vn = vsx_v128_or(vn, w0);
+
+        pos += (ui32)total_mn;
+      }
+      return row;
+    }
+
+
+    //************************************************************************/
+    /** @brief Decodes one codeblock, processing the cleanup, siginificance
+     *         propagation, and magnitude refinement pass
+     *
+     *  @param [in]   coded_data is a pointer to bitstream
+     *  @param [in]   decoded_data is a pointer to decoded codeblock data buf.
+     *  @param [in]   missing_msbs is the number of missing MSBs
+     *  @param [in]   num_passes is the number of passes: 1 if CUP only,
+     *                2 for CUP+SPP, and 3 for CUP+SPP+MRP
+     *  @param [in]   lengths1 is the length of cleanup pass
+     *  @param [in]   lengths2 is the length of refinement passes (either SPP
+     *                only or SPP+MRP)
+     *  @param [in]   width is the decoded codeblock width
+     *  @param [in]   height is the decoded codeblock height
+     *  @param [in]   stride is the decoded codeblock buffer stride
+     *  @param [in]   stripe_causal is true for stripe causal mode
+     */
+    bool ojph_decode_codeblock_vsx(ui8* coded_data, ui32* decoded_data,
+                                    ui32 missing_msbs, ui32 num_passes,
+                                    ui32 lengths1, ui32 lengths2,
+                                    ui32 width, ui32 height, ui32 stride,
+                                    bool stripe_causal)
+    {
+      static bool insufficient_precision = false;
+      static bool modify_code = false;
+      static bool truncate_spp_mrp = false;
+
+      if (num_passes > 1 && lengths2 == 0)
+      {
+        OJPH_WARN(0x00010001, "A malformed codeblock that has more than "
+                              "one coding pass, but zero length for "
+                              "2nd and potential 3rd pass.\n");
+        num_passes = 1;
+      }
+
+      if (num_passes > 3)
+      {
+        OJPH_WARN(0x00010002, "We do not support more than 3 coding passes; "
+                              "This codeblocks has %d passes.\n",
+                              num_passes);
+        return false;
+      }
+
+      if (missing_msbs > 30) // p < 0
+      {
+        if (insufficient_precision == false)
+        {
+          insufficient_precision = true;
+          OJPH_WARN(0x00010003, "32 bits are not enough to decode this "
+                                "codeblock. This message will not be "
+                                "displayed again.\n");
+        }
+        return false;
+      }
+      else if (missing_msbs == 30) // p == 0
+      { // not enough precision to decode and set the bin center to 1
+        if (modify_code == false) {
+          modify_code = true;
+          OJPH_WARN(0x00010004, "Not enough precision to decode the cleanup "
+                                "pass. The code can be modified to support "
+                                "this case. This message will not be "
+                                "displayed again.\n");
+        }
+         return false;         // 32 bits are not enough to decode this
+       }
+      else if (missing_msbs == 29) // if p is 1, then num_passes must be 1
+      {
+        if (num_passes > 1) {
+          num_passes = 1;
+          if (truncate_spp_mrp == false) {
+            truncate_spp_mrp = true;
+            OJPH_WARN(0x00010005, "Not enough precision to decode the SgnProp "
+                                  "nor MagRef passes; both will be skipped. "
+                                  "This message will not be displayed "
+                                  "again.\n");
+          }
+        }
+      }
+      ui32 p = 30 - missing_msbs; // The least significant bitplane for CUP
+      // There is a way to handle the case of p == 0, but a different path
+      // is required
+
+      if (lengths1 < 2)
+      {
+        OJPH_WARN(0x00010006, "Wrong codeblock length.\n");
+        return false;
+      }
+
+      // read scup and fix the bytes there
+      int lcup, scup;
+      lcup = (int)lengths1;  // length of CUP
+      //scup is the length of MEL + VLC
+      scup = (((int)coded_data[lcup-1]) << 4) + (coded_data[lcup-2] & 0xF);
+      if (scup < 2 || scup > lcup || scup > 4079) //something is wrong
+        return false;
+
+      // The temporary storage scratch holds two types of data in an
+      // interleaved fashion. The interleaving allows us to use one
+      // memory pointer.
+      // We have one entry for a decoded VLC code, and one entry for UVLC.
+      // Entries are 16 bits each, corresponding to one quad,
+      // but since we want to use XMM registers of the SSE family
+      // of SIMD; we allocated 16 bytes or more per quad row; that is,
+      // the width is no smaller than 16 bytes (or 8 entries), and the
+      // height is 512 quads
+      // Each VLC entry contains, in the following order, starting
+      // from MSB
+      // e_k (4bits), e_1 (4bits), rho (4bits), useless for step 2 (4bits)
+      // Each entry in UVLC contains u_q
+      // One extra row to handle the case of SPP propagating downwards
+      // when codeblock width is 4
+      ui16 scratch[8 * 513] = {0};          // 8+ kB
+
+      // We need an extra two entries (one inf and one u_q) beyond
+      // the last column.
+      // If the block width is 4 (2 quads), then we use sstr of 8
+      // (enough for 4 quads). If width is 8 (4 quads) we use
+      // sstr is 16 (enough for 8 quads). For a width of 16 (8
+      // quads), we use 24 (enough for 12 quads).
+      ui32 sstr = ((width + 2u) + 7u) & ~7u; // multiples of 8
+
+      assert((stride & 0x3) == 0);
+
+      ui32 mmsbp2 = missing_msbs + 2;
+
+      // The cleanup pass is decoded in two steps; in step one,
+      // the VLC and MEL segments are decoded, generating a record that
+      // has 2 bytes per quad. The 2 bytes contain, u, rho, e^1 & e^k.
+      // This information should be sufficient for the next step.
+      // In step 2, we decode the MagSgn segment.
+
+      // step 1 decoding VLC and MEL segments
+      {
+        // init structures
+        dec_mel_st mel;
+        mel_init(&mel, coded_data, lcup, scup);
+        rev_struct vlc;
+        rev_init(&vlc, coded_data, lcup, scup);
+
+        int run = mel_get_run(&mel); // decode runs of events from MEL bitstrm
+                                     // data represented as runs of 0 events
+                                     // See mel_decode description
+
+        ui32 vlc_val;
+        ui32 c_q = 0;
+        ui16 *sp = scratch;
+        //initial quad row
+        for (ui32 x = 0; x < width; sp += 4)
+        {
+          // decode VLC
+          /////////////
+
+          // first quad
+          vlc_val = rev_fetch(&vlc);
+
+          //decode VLC using the context c_q and the head of VLC bitstream
+          ui16 t0 = vlc_tbl0[ c_q + (vlc_val & 0x7F) ];
+
+          // if context is zero, use one MEL event
+          if (c_q == 0) //zero context
+          {
+            run -= 2; //subtract 2, since events number if multiplied by 2
+
+            // Is the run terminated in 1? if so, use decoded VLC code,
+            // otherwise, discard decoded data, since we will decoded again
+            // using a different context
+            t0 = (run == -1) ? t0 : 0;
+
+            // is run -1 or -2? this means a run has been consumed
+            if (run < 0)
+              run = mel_get_run(&mel);  // get another run
+          }
+          //run -= (c_q == 0) ? 2 : 0;
+          //t0 = (c_q != 0 || run == -1) ? t0 : 0;
+          //if (run < 0)
+          //  run = mel_get_run(&mel);  // get another run
+          sp[0] = t0;
+          x += 2;
+
+          // prepare context for the next quad; eqn. 1 in ITU T.814
+          c_q = ((t0 & 0x10U) << 3) | ((t0 & 0xE0U) << 2);
+
+          //remove data from vlc stream (0 bits are removed if vlc is not used)
+          vlc_val = rev_advance(&vlc, t0 & 0x7);
+
+          //second quad
+          ui16 t1 = 0;
+
+          //decode VLC using the context c_q and the head of VLC bitstream
+          t1 = vlc_tbl0[c_q + (vlc_val & 0x7F)];
+
+          // if context is zero, use one MEL event
+          if (c_q == 0 && x < width) //zero context
+          {
+            run -= 2; //subtract 2, since events number if multiplied by 2
+
+            // if event is 0, discard decoded t1
+            t1 = (run == -1) ? t1 : 0;
+
+            if (run < 0) // have we consumed all events in a run
+              run = mel_get_run(&mel); // if yes, then get another run
+          }
+          t1 = x < width ? t1 : 0;
+          //run -= (c_q == 0 && x < width) ? 2 : 0;
+          //t1 = (c_q != 0 || run == -1) ? t1 : 0;
+          //if (run < 0)
+          //  run = mel_get_run(&mel);  // get another run
+          sp[2] = t1;
+          x += 2;
+
+          //prepare context for the next quad, eqn. 1 in ITU T.814
+          c_q = ((t1 & 0x10U) << 3) | ((t1 & 0xE0U) << 2);
+
+          //remove data from vlc stream, if qinf is not used, cwdlen is 0
+          vlc_val = rev_advance(&vlc, t1 & 0x7);
+
+          // decode u
+          /////////////
+          // uvlc_mode is made up of u_offset bits from the quad pair
+          ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
+          if (uvlc_mode == 0xc0)// if both u_offset are set, get an event from
+          {                     // the MEL run of events
+            run -= 2; //subtract 2, since events number if multiplied by 2
+
+            uvlc_mode += (run == -1) ? 0x40 : 0; // increment uvlc_mode by
+                                                 // is 0x40
+
+            if (run < 0)//if run is consumed (run is -1 or -2), get another run
+              run = mel_get_run(&mel);
+          }
+          //run -= (uvlc_mode == 0xc0) ? 2 : 0;
+          //uvlc_mode += (uvlc_mode == 0xc0 && run == -1) ? 0x40 : 0;
+          //if (run < 0)
+          //  run = mel_get_run(&mel);  // get another run
+
+          //decode uvlc_mode to get u for both quads
+          ui32 uvlc_entry = uvlc_tbl0[uvlc_mode + (vlc_val & 0x3F)];
+          //remove total prefix length
+          vlc_val = rev_advance(&vlc, uvlc_entry & 0x7);
+          uvlc_entry >>= 3;
+          //extract suffixes for quad 0 and 1
+          ui32 len = uvlc_entry & 0xF;           //suffix length for 2 quads
+          ui32 tmp = vlc_val & ((1 << len) - 1); //suffix value for 2 quads
+          vlc_val = rev_advance(&vlc, len);
+          uvlc_entry >>= 4;
+          // quad 0 length
+          len = uvlc_entry & 0x7; // quad 0 suffix length
+          uvlc_entry >>= 3;
+          ui16 u_q = (ui16)(1 + (uvlc_entry&7) + (tmp&~(0xFFU<<len))); //kap. 1
+          sp[1] = u_q;
+          u_q = (ui16)(1 + (uvlc_entry >> 3) + (tmp >> len));  //kappa == 1
+          sp[3] = u_q;
+        }
+        sp[0] = sp[1] = 0;
+
+        //non initial quad rows
+        for (ui32 y = 2; y < height; y += 2)
+        {
+          c_q = 0;                                // context
+          ui16 *sp = scratch + (y >> 1) * sstr;   // this row of quads
+
+          for (ui32 x = 0; x < width; sp += 4)
+          {
+            // decode VLC
+            /////////////
+
+            // sigma_q (n, ne, nf)
+            c_q |= ((sp[0 - (si32)sstr] & 0xA0U) << 2);
+            c_q |= ((sp[2 - (si32)sstr] & 0x20U) << 4);
+
+            // first quad
+            vlc_val = rev_fetch(&vlc);
+
+            //decode VLC using the context c_q and the head of VLC bitstream
+            ui16 t0 = vlc_tbl1[ c_q + (vlc_val & 0x7F) ];
+
+            // if context is zero, use one MEL event
+            if (c_q == 0) //zero context
+            {
+              run -= 2; //subtract 2, since events number is multiplied by 2
+
+              // Is the run terminated in 1? if so, use decoded VLC code,
+              // otherwise, discard decoded data, since we will decoded again
+              // using a different context
+              t0 = (run == -1) ? t0 : 0;
+
+              // is run -1 or -2? this means a run has been consumed
+              if (run < 0)
+                run = mel_get_run(&mel);  // get another run
+            }
+            //run -= (c_q == 0) ? 2 : 0;
+            //t0 = (c_q != 0 || run == -1) ? t0 : 0;
+            //if (run < 0)
+            //  run = mel_get_run(&mel);  // get another run
+            sp[0] = t0;
+            x += 2;
+
+            // prepare context for the next quad; eqn. 2 in ITU T.814
+            // sigma_q (w, sw)
+            c_q = ((t0 & 0x40U) << 2) | ((t0 & 0x80U) << 1);
+            // sigma_q (nw)
+            c_q |= sp[0 - (si32)sstr] & 0x80;
+            // sigma_q (n, ne, nf)
+            c_q |= ((sp[2 - (si32)sstr] & 0xA0U) << 2);
+            c_q |= ((sp[4 - (si32)sstr] & 0x20U) << 4);
+
+            //remove data from vlc stream (0 bits are removed if vlc is unused)
+            vlc_val = rev_advance(&vlc, t0 & 0x7);
+
+            //second quad
+            ui16 t1 = 0;
+
+            //decode VLC using the context c_q and the head of VLC bitstream
+            t1 = vlc_tbl1[ c_q + (vlc_val & 0x7F)];
+
+            // if context is zero, use one MEL event
+            if (c_q == 0 && x < width) //zero context
+            {
+              run -= 2; //subtract 2, since events number if multiplied by 2
+
+              // if event is 0, discard decoded t1
+              t1 = (run == -1) ? t1 : 0;
+
+              if (run < 0) // have we consumed all events in a run
+                run = mel_get_run(&mel); // if yes, then get another run
+            }
+            t1 = x < width ? t1 : 0;
+            //run -= (c_q == 0 && x < width) ? 2 : 0;
+            //t1 = (c_q != 0 || run == -1) ? t1 : 0;
+            //if (run < 0)
+            //  run = mel_get_run(&mel);  // get another run
+            sp[2] = t1;
+            x += 2;
+
+            // partial c_q, will be completed when we process the next quad
+            // sigma_q (w, sw)
+            c_q = ((t1 & 0x40U) << 2) | ((t1 & 0x80U) << 1);
+            // sigma_q (nw)
+            c_q |= sp[2 - (si32)sstr] & 0x80;
+
+            //remove data from vlc stream, if qinf is not used, cwdlen is 0
+            vlc_val = rev_advance(&vlc, t1 & 0x7);
+
+            // decode u
+            /////////////
+            // uvlc_mode is made up of u_offset bits from the quad pair
+            ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
+            ui32 uvlc_entry = uvlc_tbl1[uvlc_mode + (vlc_val & 0x3F)];
+            //remove total prefix length
+            vlc_val = rev_advance(&vlc, uvlc_entry & 0x7);
+            uvlc_entry >>= 3;
+            //extract suffixes for quad 0 and 1
+            ui32 len = uvlc_entry & 0xF;           //suffix length for 2 quads
+            ui32 tmp = vlc_val & ((1 << len) - 1); //suffix value for 2 quads
+            vlc_val = rev_advance(&vlc, len);
+            uvlc_entry >>= 4;
+            // quad 0 length
+            len = uvlc_entry & 0x7; // quad 0 suffix length
+            uvlc_entry >>= 3;
+            ui16 u_q = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFU << len))); //u_q
+            sp[1] = u_q;
+            u_q = (ui16)((uvlc_entry >> 3) + (tmp >> len)); // u_q
+            sp[3] = u_q;
+          }
+          sp[0] = sp[1] = 0;
+        }
+      }
+
+      // step2 we decode magsgn
+      // mmsbp2 equals K_max + 1 (we decode up to K_max bits + 1 sign bit)
+      // The 32 bit path decode 16 bits data, for which one would think
+      // 16 bits are enough, because we want to put in the center of the
+      // bin.
+      // If you have mmsbp2 equals 16 bit, and reversible coding, and
+      // no bitplanes are missing, then we can decoding using the 16 bit
+      // path, but we are not doing this here.
+      if (mmsbp2 >= 16)
+      {
+        // We allocate a scratch row for storing v_n values.
+        // We have 512 quads horizontally.
+        // We may go beyond the last entry by up to 4 entries.
+        // Here we allocate additional 8 entries.
+        // There are two rows in this structure, the bottom
+        // row is used to store processed entries.
+        const int v_n_size = 512 + 8;
+        ui32 v_n_scratch[2 * v_n_size] = {0}; // 4+ kB
+
+        frwd_struct magsgn;
+        frwd_init<0xFF>(&magsgn, coded_data, lcup - scup);
+
+        {
+          ui16 *sp = scratch;
+          ui32 *vp = v_n_scratch;
+          ui32 *dp = decoded_data;
+          vp[0] = 2; // for easy calculation of emax
+
+          for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
+          {
+            //here we process two quads
+            v128_t w0, w1; // workers
+            v128_t inf_u_q, U_q;
+            // determine U_q
+            {
+              inf_u_q = vsx_v128_load(sp);
+              U_q = vsx_u32x4_shr(inf_u_q, 16);
+
+              w0 = vsx_i32x4_gt(U_q, vsx_u32x4_splat(mmsbp2));
+              ui32 i = vsx_i8x16_bitmask(w0);
+              if (i & 0xFF) // only the lower two U_q
+                return false;
+            }
+
+            v128_t vn = vsx_i32x4_const(OJPH_REPEAT4(2));
+            v128_t row0 = decode_one_quad32<0>(inf_u_q, U_q, &magsgn, p, vn);
+            v128_t row1 = decode_one_quad32<1>(inf_u_q, U_q, &magsgn, p, vn);
+            w0 = vsx_v128_load(vp);
+            w0 = vsx_v128_and(w0, vsx_i32x4_const(-1,0,0,0));
+            w0 = vsx_v128_or(w0, vn);
+            vsx_v128_store(vp, w0);
+
+            //interleave in ssse3 style
+
+            w0 = vsx_i32x4_shuffle(row0, row1, 0, 4, 1, 5);
+            w1 = vsx_i32x4_shuffle(row0, row1, 2, 6, 3, 7);
+            row0 = vsx_i32x4_shuffle(w0, w1, 0, 4, 1, 5);
+            row1 = vsx_i32x4_shuffle(w0, w1, 2, 6, 3, 7);
+            vsx_v128_store(dp, row0);
+            vsx_v128_store(dp + stride, row1);
+          }
+        }
+
+        for (ui32 y = 2; y < height; y += 2)
+        {
+          {
+            // perform 31 - count_leading_zeros(*vp) here
+            ui32 *vp = v_n_scratch;
+            const v128_t lut_lo = vsx_i8x16_const(
+              31, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4
+            );
+            const v128_t lut_hi = vsx_i8x16_const(
+              31, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
+            );
+            const v128_t nibble_mask = vsx_i8x16_const(OJPH_REPEAT16(0x0F));
+            const v128_t byte_offset8 = vsx_i16x8_const(OJPH_REPEAT8(8));
+            const v128_t byte_offset16 = vsx_i16x8_const(OJPH_REPEAT8(16));
+            const v128_t cc = vsx_i32x4_const(OJPH_REPEAT4(31));
+            for (ui32 x = 0; x <= width; x += 8, vp += 4)
+            {
+              v128_t v, t; // workers
+              v = vsx_v128_load(vp);
+
+              t = vsx_v128_and(nibble_mask, v);
+              v = vsx_v128_and(vsx_u16x8_shr(v, 4), nibble_mask);
+              t = vsx_i8x16_swizzle(lut_lo, t);
+              v = vsx_i8x16_swizzle(lut_hi, v);
+              v = vsx_u8x16_min(v, t);
+
+              t = vsx_u16x8_shr(v, 8);
+              v = vsx_v128_or(v, byte_offset8);
+              v = vsx_u8x16_min(v, t);
+
+              t = vsx_u32x4_shr(v, 16);
+              v = vsx_v128_or(v, byte_offset16);
+              v = vsx_u8x16_min(v, t);
+
+              v = vsx_i16x8_sub(cc, v);
+              vsx_v128_store(vp + v_n_size, v);
+            }
+          }
+
+          ui32 *vp = v_n_scratch;
+          ui16 *sp = scratch + (y >> 1) * sstr;
+          ui32 *dp = decoded_data + y * stride;
+          vp[0] = 2; // for easy calculation of emax
+
+          for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
+          {
+            //process two quads
+            v128_t w0, w1; // workers
+            v128_t inf_u_q, U_q;
+            // determine U_q
+            {
+              v128_t gamma, emax, kappa, u_q; // needed locally
+
+              inf_u_q = vsx_v128_load(sp);
+              gamma =
+                vsx_v128_and(inf_u_q, vsx_i32x4_const(OJPH_REPEAT4(0xF0)));
+              w0 = vsx_i32x4_sub(gamma, vsx_i32x4_const(OJPH_REPEAT4(1)));
+              gamma = vsx_v128_and(gamma, w0);
+              gamma = vsx_i32x4_eq(gamma, vsx_i64x2_const(0, 0));
+
+              emax = vsx_v128_load(vp + v_n_size);
+              w0 = vsx_i32x4_shuffle(emax, vsx_i64x2_const(0,0), 1, 2, 3, 4);
+              emax = vsx_i16x8_max(w0, emax); // no max_epi32 in ssse3
+              emax = vsx_v128_andnot(emax, gamma);
+
+              kappa = vsx_i32x4_const(OJPH_REPEAT4(1));
+              kappa = vsx_i16x8_max(emax, kappa); // no max_epi32 in ssse3
+
+              u_q = vsx_u32x4_shr(inf_u_q, 16);
+              U_q = vsx_i32x4_add(u_q, kappa);
+
+              w0 = vsx_i32x4_gt(U_q, vsx_u32x4_splat(mmsbp2));
+              ui32 i = vsx_i8x16_bitmask(w0);
+              if (i & 0xFF) // only the lower two U_q
+                return false;
+            }
+
+            v128_t vn = vsx_i32x4_const(OJPH_REPEAT4(2));
+            v128_t row0 = decode_one_quad32<0>(inf_u_q, U_q, &magsgn, p, vn);
+            v128_t row1 = decode_one_quad32<1>(inf_u_q, U_q, &magsgn, p, vn);
+            w0 = vsx_v128_load(vp);
+            w0 = vsx_v128_and(w0, vsx_i32x4_const(-1,0,0,0));
+            w0 = vsx_v128_or(w0, vn);
+            vsx_v128_store(vp, w0);
+
+            //interleave in ssse3 style
+            w0 = vsx_i32x4_shuffle(row0, row1, 0, 4, 1, 5);
+            w1 = vsx_i32x4_shuffle(row0, row1, 2, 6, 3, 7);
+            row0 = vsx_i32x4_shuffle(w0, w1, 0, 4, 1, 5);
+            row1 = vsx_i32x4_shuffle(w0, w1, 2, 6, 3, 7);
+            vsx_v128_store(dp, row0);
+            vsx_v128_store(dp + stride, row1);
+          }
+        }
+      }
+      else
+      {
+        // reduce bitplane by 16 because we now have 16 bits instead of 32
+        p -= 16;
+
+        // We allocate a scratch row for storing v_n values.
+        // We have 512 quads horizontally.
+        // We may go beyond the last entry by up to 8 entries.
+        // Therefore we allocate additional 8 entries.
+        // There are two rows in this structure, the bottom
+        // row is used to store processed entries.
+        const int v_n_size = 512 + 8;
+        ui16 v_n_scratch[2 * v_n_size] = {0}; // 2+ kB
+
+        // destuff the MagSgn bitstream upfront; per-quad consumption then
+        // advances a bit position in a GPR (see destuff_frwd)
+        const ui32 dbuf_cap = 4096 * 15 / 8;
+        ui8 dbuf[dbuf_cap + 72];
+        ui32 limit = destuff_frwd<0xFF>(coded_data, lcup - scup,
+                                        dbuf, dbuf_cap);
+        ui32 pos = 0;
+
+        {
+          ui16 *sp = scratch;
+          ui16 *vp = v_n_scratch;
+          ui32 *dp = decoded_data;
+          vp[0] = 2; // for easy calculation of emax
+
+          for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
+          {
+            //here we process two quads
+            v128_t w0, w1; // workers
+            v128_t inf_u_q, U_q;
+            // determine U_q
+            {
+              inf_u_q = vsx_v128_load(sp);
+              U_q = vsx_u32x4_shr(inf_u_q, 16);
+
+              w0 = vsx_i32x4_gt(U_q, vsx_u32x4_splat(mmsbp2));
+              ui32 i = vsx_i8x16_bitmask(w0);
+              if (i & 0xFF) // only the lower two U_q
+                return false;
+            }
+
+            v128_t vn = vsx_i16x8_const(OJPH_REPEAT8(2));
+            v128_t row = decode_two_quad16(inf_u_q, U_q, dbuf, limit, pos, p, vn);
+            w0 = vsx_v128_load(vp);
+            w0 = vsx_v128_and(w0, vsx_i16x8_const(-1,0,0,0,0,0,0,0));
+            w0 = vsx_v128_or(w0, vn);
+            vsx_v128_store(vp, w0);
+
+            //interleave in ssse3 style
+            w0 = vsx_i8x16_swizzle(row,
+              vsx_i16x8_const(-1, 0x0100, -1, 0x0504,
+                               -1, 0x0908, -1, 0x0D0C));
+            vsx_v128_store(dp, w0);
+            w1 = vsx_i8x16_swizzle(row,
+              vsx_i16x8_const(-1, 0x0302, -1, 0x0706,
+                               -1, 0x0B0A, -1, 0x0F0E));
+            vsx_v128_store(dp + stride, w1);
+          }
+        }
+
+        for (ui32 y = 2; y < height; y += 2)
+        {
+          {
+            // perform 15 - count_leading_zeros(*vp) here
+            ui16 *vp = v_n_scratch;
+            const v128_t lut_lo = vsx_i8x16_const(
+              15, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4
+            );
+            const v128_t lut_hi = vsx_i8x16_const(
+              15, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
+            );
+            const v128_t nibble_mask = vsx_i8x16_const(OJPH_REPEAT16(0x0F));
+            const v128_t byte_offset8 = vsx_i16x8_const(OJPH_REPEAT8(8));
+            const v128_t cc = vsx_i16x8_const(OJPH_REPEAT8(15));
+            for (ui32 x = 0; x <= width; x += 16, vp += 8)
+            {
+              v128_t v, t; // workers
+              v = vsx_v128_load(vp);
+
+              t = vsx_v128_and(nibble_mask, v);
+              v = vsx_v128_and(vsx_u16x8_shr(v, 4), nibble_mask);
+              t = vsx_i8x16_swizzle(lut_lo, t);
+              v = vsx_i8x16_swizzle(lut_hi, v);
+              v = vsx_u8x16_min(v, t);
+
+              t = vsx_u16x8_shr(v, 8);
+              v = vsx_v128_or(v, byte_offset8);
+              v = vsx_u8x16_min(v, t);
+
+              v = vsx_i16x8_sub(cc, v);
+              vsx_v128_store(vp + v_n_size, v);
+            }
+          }
+
+          ui16 *vp = v_n_scratch;
+          ui16 *sp = scratch + (y >> 1) * sstr;
+          ui32 *dp = decoded_data + y * stride;
+          vp[0] = 2; // for easy calculation of emax
+
+          for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
+          {
+            //process two quads
+            v128_t w0, w1; // workers
+            v128_t inf_u_q, U_q;
+            // determine U_q
+            {
+              v128_t gamma, emax, kappa, u_q; // needed locally
+
+              inf_u_q = vsx_v128_load(sp);
+              gamma =
+                vsx_v128_and(inf_u_q, vsx_i32x4_const(OJPH_REPEAT4(0xF0)));
+              w0 = vsx_i32x4_sub(gamma, vsx_i32x4_const(OJPH_REPEAT4(1)));
+              gamma = vsx_v128_and(gamma, w0);
+              gamma = vsx_i32x4_eq(gamma, vsx_i64x2_const(0, 0));
+
+              emax = vsx_v128_load(vp + v_n_size);
+              w0 = vsx_i16x8_shuffle(emax,
+                vsx_i64x2_const(0, 0), 1, 2, 3, 4, 5, 6, 7, 8);
+              emax = vsx_i16x8_max(w0, emax); // no max_epi32 in ssse3
+              emax = vsx_i8x16_swizzle(emax,
+                vsx_i16x8_const(0x0100, -1, 0x0302, -1,
+                                 0x0504, -1, 0x0706, -1));
+              emax = vsx_v128_andnot(emax, gamma);
+
+              kappa = vsx_i32x4_const(OJPH_REPEAT4(1));
+              kappa = vsx_i16x8_max(emax, kappa); // no max_epi32 in ssse3
+
+              u_q = vsx_u32x4_shr(inf_u_q, 16);
+              U_q = vsx_i32x4_add(u_q, kappa);
+
+              w0 = vsx_i32x4_gt(U_q, vsx_u32x4_splat(mmsbp2));
+              ui32 i = vsx_i8x16_bitmask(w0);
+              if (i & 0xFF) // only the lower two U_q
+                return false;
+            }
+
+            v128_t vn = vsx_i16x8_const(OJPH_REPEAT8(2));
+            v128_t row = decode_two_quad16(inf_u_q, U_q, dbuf, limit, pos, p, vn);
+            w0 = vsx_v128_load(vp);
+            w0 = vsx_v128_and(w0, vsx_i16x8_const(-1,0,0,0,0,0,0,0));
+            w0 = vsx_v128_or(w0, vn);
+            vsx_v128_store(vp, w0);
+
+            w0 = vsx_i8x16_swizzle(row,
+              vsx_i16x8_const(-1, 0x0100, -1, 0x0504,
+                               -1, 0x0908, -1, 0x0D0C));
+            vsx_v128_store(dp, w0);
+            w1 = vsx_i8x16_swizzle(row,
+              vsx_i16x8_const(-1, 0x0302, -1, 0x0706,
+                               -1, 0x0B0A, -1, 0x0F0E));
+            vsx_v128_store(dp + stride, w1);
+          }
+        }
+
+        // increase bitplane back by 16 because we need to process 32 bits
+        p += 16;
+      }
+
+      if (num_passes > 1)
+      {
+        // We use scratch again, we can divide it into multiple regions
+        // sigma holds all the significant samples, and it cannot
+        // be modified after it is set.  it will be used during the
+        // Magnitude Refinement Pass
+        ui16* const sigma = scratch;
+
+        ui32 mstr = (width + 3u) >> 2;   // divide by 4, since each
+                                         // ui16 contains 4 columns
+        mstr = ((mstr + 2u) + 7u) & ~7u; // multiples of 8
+
+        // We re-arrange quad significance, where each 4 consecutive
+        // bits represent one quad, into column significance, where,
+        // each 4 consequtive bits represent one column of 4 rows
+        {
+          ui32 y;
+
+          const v128_t mask_3 = vsx_i32x4_const(OJPH_REPEAT4(0x30));
+          const v128_t mask_C = vsx_i32x4_const(OJPH_REPEAT4(0xC0));
+          const v128_t shuffle_mask = vsx_i32x4_const(0x0C080400,-1,-1,-1);
+          for (y = 0; y < height; y += 4)
+          {
+            ui16* sp = scratch + (y >> 1) * sstr;
+            ui16* dp = sigma + (y >> 2) * mstr;
+            for (ui32 x = 0; x < width; x += 8, sp += 8, dp += 2)
+            {
+              v128_t s0, s1, u3, uC, t0, t1;
+
+              s0 = vsx_v128_load(sp);
+              u3 = vsx_v128_and(s0, mask_3);
+              u3 = vsx_u32x4_shr(u3, 4);
+              uC = vsx_v128_and(s0, mask_C);
+              uC = vsx_u32x4_shr(uC, 2);
+              t0 = vsx_v128_or(u3, uC);
+
+              s1 = vsx_v128_load(sp + sstr);
+              u3 = vsx_v128_and(s1, mask_3);
+              u3 = vsx_u32x4_shr(u3, 2);
+              uC = vsx_v128_and(s1, mask_C);
+              t1 = vsx_v128_or(u3, uC);
+
+              v128_t r = vsx_v128_or(t0, t1);
+              r = vsx_i8x16_swizzle(r, shuffle_mask);
+
+              vsx_v128_store32_lane(dp, r, 0);
+            }
+            dp[0] = 0; // set an extra entry on the right with 0
+          }
+          {
+            // reset one row after the codeblock
+            ui16* dp = sigma + (y >> 2) * mstr;
+            v128_t zero = vsx_i64x2_const(0, 0);
+            for (ui32 x = 0; x < width; x += 32, dp += 8)
+              vsx_v128_store(dp, zero);
+            dp[0] = 0; // set an extra entry on the right with 0
+          }
+        }
+
+        // We perform Significance Propagation Pass here
+        {
+          // This stores significance information of the previous
+          // 4 rows.  Significance information in this array includes
+          // all signicant samples in bitplane p - 1; that is,
+          // significant samples for bitplane p (discovered during the
+          // cleanup pass and stored in sigma) and samples that have recently
+          // became significant (during the SPP) in bitplane p-1.
+          // We store enough for the widest row, containing 1024 columns,
+          // which is equivalent to 256 of ui16, since each stores 4 columns.
+          // We add an extra 8 entries, just in case we need more
+          ui16 prev_row_sig[256 + 8] = {0}; // 528 Bytes
+
+          frwd_struct sigprop;
+          frwd_init<0>(&sigprop, coded_data + lengths1, (int)lengths2);
+
+          for (ui32 y = 0; y < height; y += 4)
+          {
+            ui32 pattern = 0xFFFFu; // a pattern needed samples
+            if (height - y < 4) {
+              pattern = 0x7777u;
+              if (height - y < 3) {
+                pattern = 0x3333u;
+                if (height - y < 2)
+                  pattern = 0x1111u;
+              }
+            }
+
+            // prev holds sign. info. for the previous quad, together
+            // with the rows on top of it and below it.
+            ui32 prev = 0;
+            ui16 *prev_sig = prev_row_sig;
+            ui16 *cur_sig = sigma + (y >> 2) * mstr;
+            ui32 *dpp = decoded_data + y * stride;
+            for (ui32 x = 0; x < width; x += 4, dpp += 4, ++cur_sig, ++prev_sig)
+            {
+              // only rows and columns inside the stripe are included
+              si32 s = (si32)x + 4 - (si32)width;
+              s = ojph_max(s, 0);
+              pattern = pattern >> (s * 4);
+
+              // We first find locations that need to be tested (potential
+              // SPP members); these location will end up in mbr
+              // In each iteration, we produce 16 bits because cwd can have
+              // up to 16 bits of significance information, followed by the
+              // corresponding 16 bits of sign information; therefore, it is
+              // sufficient to fetch 32 bit data per loop.
+
+              // Althougth we are interested in 16 bits only, we load 32 bits.
+              // For the 16 bits we are producing, we need the next 4 bits --
+              // We need data for at least 5 columns out of 8.
+              // Therefore loading 32 bits is easier than loading 16 bits
+              // twice.
+              ui32 ps; memcpy(&ps, prev_sig, sizeof(ps));
+              ui32 ns; memcpy(&ns, cur_sig + mstr, sizeof(ns));
+              ui32 u = (ps & 0x88888888) >> 3; // the row on top
+              if (!stripe_causal)
+                u |= (ns & 0x11111111) << 3;   // the row below
+
+              ui32 cs; memcpy(&cs, cur_sig, sizeof(cs));
+              // vertical integration
+              ui32 mbr =  cs;                // this sig. info.
+              mbr |= (cs & 0x77777777) << 1; //above neighbors
+              mbr |= (cs & 0xEEEEEEEE) >> 1; //below neighbors
+              mbr |= u;
+              // horizontal integration
+              ui32 t = mbr;
+              mbr |= t << 4;      // neighbors on the left
+              mbr |= t >> 4;      // neighbors on the right
+              mbr |= prev >> 12;  // significance of previous group
+
+              // remove outside samples, and already significant samples
+              mbr &= pattern;
+              mbr &= ~cs;
+
+              // find samples that become significant during the SPP
+              ui32 new_sig = mbr;
+              if (new_sig)
+              {
+                v128_t cwd_vec = frwd_fetch<0>(&sigprop);
+                ui32 cwd = vsx_u32x4_extract_lane(cwd_vec, 0);
+
+                ui32 cnt = 0;
+                ui32 col_mask = 0xFu;
+                ui32 inv_sig = ~cs & pattern;
+                for (int i = 0; i < 16; i += 4, col_mask <<= 4)
+                {
+                  if ((col_mask & new_sig) == 0)
+                    continue;
+
+                  //scan one column
+                  ui32 sample_mask = 0x1111u & col_mask;
+                  if (new_sig & sample_mask)
+                  {
+                    new_sig &= ~sample_mask;
+                    if (cwd & 1)
+                    {
+                      ui32 t = 0x33u << i;
+                      new_sig |= t & inv_sig;
+                    }
+                    cwd >>= 1; ++cnt;
+                  }
+
+                  sample_mask <<= 1;
+                  if (new_sig & sample_mask)
+                  {
+                    new_sig &= ~sample_mask;
+                    if (cwd & 1)
+                    {
+                      ui32 t = 0x76u << i;
+                      new_sig |= t & inv_sig;
+                    }
+                    cwd >>= 1; ++cnt;
+                  }
+
+                  sample_mask <<= 1;
+                  if (new_sig & sample_mask)
+                  {
+                    new_sig &= ~sample_mask;
+                    if (cwd & 1)
+                    {
+                      ui32 t = 0xECu << i;
+                      new_sig |= t & inv_sig;
+                    }
+                    cwd >>= 1; ++cnt;
+                  }
+
+                  sample_mask <<= 1;
+                  if (new_sig & sample_mask)
+                  {
+                    new_sig &= ~sample_mask;
+                    if (cwd & 1)
+                    {
+                      ui32 t = 0xC8u << i;
+                      new_sig |= t & inv_sig;
+                    }
+                    cwd >>= 1; ++cnt;
+                  }
+                }
+
+                if (new_sig)
+                {
+                  // Spread new_sig, such that each bit is in one byte with a
+                  // value of 0 if new_sig bit is 0, and 0xFF if new_sig is 1
+                  v128_t new_sig_vec = vsx_i16x8_splat((si16)new_sig);
+                  new_sig_vec = vsx_i8x16_swizzle(new_sig_vec,
+                    vsx_i8x16_const(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1));
+                  new_sig_vec = vsx_v128_and(new_sig_vec,
+                    vsx_u64x2_const(OJPH_REPEAT2(0x8040201008040201)));
+                  new_sig_vec = vsx_i8x16_eq(new_sig_vec,
+                    vsx_u64x2_const(OJPH_REPEAT2(0x8040201008040201)));
+
+                  // find cumulative sums
+                  // to find which bit in cwd we should extract
+                  v128_t ex_sum, shfl, inc_sum = new_sig_vec; // inclusive scan
+                  inc_sum = vsx_i8x16_abs(inc_sum); // cvrt to 0 or 1
+                  shfl = vsx_i8x16_shuffle(vsx_i64x2_const(0,0), inc_sum,
+                    15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30);
+                  inc_sum = vsx_i8x16_add(inc_sum, shfl);
+                  shfl = vsx_i16x8_shuffle(vsx_i64x2_const(0,0), inc_sum,
+                    7, 8, 9, 10, 11, 12, 13, 14);
+                  inc_sum = vsx_i8x16_add(inc_sum, shfl);
+                  shfl = vsx_i32x4_shuffle(vsx_i64x2_const(0,0), inc_sum,
+                    3, 4, 5, 6);
+                  inc_sum = vsx_i8x16_add(inc_sum, shfl);
+                  shfl = vsx_i64x2_shuffle(vsx_i64x2_const(0,0), inc_sum,
+                    1, 2);
+                  inc_sum = vsx_i8x16_add(inc_sum, shfl);
+                  cnt += vsx_u8x16_extract_lane(inc_sum, 15);
+                  // exclusive scan
+                  ex_sum = vsx_i8x16_shuffle(vsx_i64x2_const(0,0), inc_sum,
+                    15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30);
+
+                  // Spread cwd, such that each bit is in one byte
+                  // with a value of 0 or 1.
+                  cwd_vec = vsx_i16x8_splat((si16)cwd);
+                  cwd_vec = vsx_i8x16_swizzle(cwd_vec,
+                    vsx_i8x16_const(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1));
+                  cwd_vec = vsx_v128_and(cwd_vec,
+                    vsx_u64x2_const(OJPH_REPEAT2(0x8040201008040201)));
+                  cwd_vec = vsx_i8x16_eq(cwd_vec,
+                    vsx_u64x2_const(OJPH_REPEAT2(0x8040201008040201)));
+                  cwd_vec = vsx_i8x16_abs(cwd_vec);
+
+                  // Obtain bit from cwd_vec correspondig to ex_sum
+                  // Basically, collect needed bits from cwd_vec
+                  v128_t v = vsx_i8x16_swizzle(cwd_vec, ex_sum);
+
+                  // load data and set spp coefficients
+                  v128_t m = vsx_i8x16_const(
+                    0,-1,-1,-1,4,-1,-1,-1,8,-1,-1,-1,12,-1,-1,-1);
+                  v128_t val = vsx_i32x4_splat(3 << (p - 2));
+                  ui32 *dp = dpp;
+                  for (int c = 0; c < 4; ++ c) {
+                    v128_t s0, s0_ns, s0_val;
+                    // load coefficients
+                    s0 = vsx_v128_load(dp);
+
+                    // epi32 is -1 only for coefficient that
+                    // are changed during the SPP
+                    s0_ns = vsx_i8x16_swizzle(new_sig_vec, m);
+                    s0_ns = vsx_i32x4_eq(s0_ns,
+                      vsx_i32x4_const(OJPH_REPEAT4(0xFF)));
+
+                    // obtain sign for coefficients in SPP
+                    s0_val = vsx_i8x16_swizzle(v, m);
+                    s0_val = vsx_i32x4_shl(s0_val, 31);
+                    s0_val = vsx_v128_or(s0_val, val);
+                    s0_val = vsx_v128_and(s0_val, s0_ns);
+
+                    // update vector
+                    s0 = vsx_v128_or(s0, s0_val);
+                    // store coefficients
+                    vsx_v128_store(dp, s0);
+                    // prepare for next row
+                    dp += stride;
+                    m = vsx_i32x4_add(m, vsx_i32x4_const(OJPH_REPEAT4(1)));
+                  }
+                }
+                frwd_advance(&sigprop, cnt);
+              }
+
+              new_sig |= cs;
+              *prev_sig = (ui16)(new_sig);
+
+              // vertical integration for the new sig. info.
+              t = new_sig;
+              new_sig |= (t & 0x7777) << 1; //above neighbors
+              new_sig |= (t & 0xEEEE) >> 1; //below neighbors
+              // add sig. info. from the row on top and below
+              prev = new_sig | u;
+              // we need only the bits in 0xF000
+              prev &= 0xF000;
+            }
+          }
+        }
+
+        // We perform Magnitude Refinement Pass here
+        if (num_passes > 2)
+        {
+          rev_struct magref;
+          rev_init_mrp(&magref, coded_data, (int)lengths1, (int)lengths2);
+
+          for (ui32 y = 0; y < height; y += 4)
+          {
+            ui16 *cur_sig = sigma + (y >> 2) * mstr;
+            ui32 *dpp = decoded_data + y * stride;
+            for (ui32 i = 0; i < width; i += 4, dpp += 4)
+            {
+              //Process one entry from sigma array at a time
+              // Each nibble (4 bits) in the sigma array represents 4 rows,
+              ui32 cwd = rev_fetch_mrp(&magref); // get 32 bit data
+              ui16 sig = *cur_sig++; // 16 bit that will be processed now
+              int total_bits = 0;
+              if (sig) // if any of the 32 bits are set
+              {
+                // We work on 4 rows, with 4 samples each, since
+                // data is 32 bit (4 bytes)
+
+                // spread the 16 bits in sig to 0 or 1 bytes in sig_vec
+                v128_t sig_vec = vsx_i16x8_splat((si16)sig);
+                sig_vec = vsx_i8x16_swizzle(sig_vec,
+                  vsx_i8x16_const(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1));
+                sig_vec = vsx_v128_and(sig_vec,
+                  vsx_u64x2_const(OJPH_REPEAT2(0x8040201008040201)));
+                sig_vec = vsx_i8x16_eq(sig_vec,
+                  vsx_u64x2_const(OJPH_REPEAT2(0x8040201008040201)));
+                sig_vec = vsx_i8x16_abs(sig_vec);
+
+                // find cumulative sums
+                // to find which bit in cwd we should extract
+                v128_t ex_sum, shfl, inc_sum = sig_vec; // inclusive scan
+                shfl = vsx_i8x16_shuffle(vsx_i64x2_const(0,0), inc_sum,
+                  15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30);
+                inc_sum = vsx_i8x16_add(inc_sum, shfl);
+                shfl = vsx_i16x8_shuffle(vsx_i64x2_const(0,0), inc_sum,
+                  7, 8, 9, 10, 11, 12, 13, 14);
+                inc_sum = vsx_i8x16_add(inc_sum, shfl);
+                shfl = vsx_i32x4_shuffle(vsx_i64x2_const(0,0), inc_sum,
+                  3, 4, 5, 6);
+                inc_sum = vsx_i8x16_add(inc_sum, shfl);
+                shfl = vsx_i64x2_shuffle(vsx_i64x2_const(0,0), inc_sum,
+                  1, 2);
+                inc_sum = vsx_i8x16_add(inc_sum, shfl);
+                total_bits = vsx_u8x16_extract_lane(inc_sum, 15);
+                // exclusive scan
+                ex_sum = vsx_i8x16_shuffle(vsx_i64x2_const(0,0), inc_sum,
+                  15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30);
+
+                // Spread the 16 bits in cwd to inverted 0 or 1 bytes in
+                // cwd_vec. Then, convert these to a form suitable
+                // for coefficient modifications; in particular, a value
+                // of 0 is presented as binary 11, and a value of 1 is
+                // represented as binary 01
+                v128_t cwd_vec = vsx_i16x8_splat((si16)cwd);
+                cwd_vec = vsx_i8x16_swizzle(cwd_vec,
+                  vsx_i8x16_const(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1));
+                cwd_vec = vsx_v128_and(cwd_vec,
+                  vsx_u64x2_const(OJPH_REPEAT2(0x8040201008040201)));
+                cwd_vec = vsx_i8x16_eq(cwd_vec,
+                  vsx_u64x2_const(OJPH_REPEAT2(0x8040201008040201)));
+                cwd_vec =
+                  vsx_i8x16_add(cwd_vec, vsx_i8x16_const(OJPH_REPEAT16(1)));
+                cwd_vec = vsx_i8x16_add(cwd_vec, cwd_vec);
+                cwd_vec =
+                  vsx_v128_or(cwd_vec, vsx_i8x16_const(OJPH_REPEAT16(1)));
+
+                // load data and insert the mrp bit
+                v128_t m = vsx_i8x16_const(0,-1,-1,-1,4,-1,-1,-1,
+                                            8,-1,-1,-1,12,-1,-1,-1);
+                ui32 *dp = dpp;
+                for (int c = 0; c < 4; ++c) {
+                  v128_t s0, s0_sig, s0_idx, s0_val;
+                  // load coefficients
+                  s0 = vsx_v128_load(dp);
+                  // find significant samples in this row
+                  s0_sig = vsx_i8x16_swizzle(sig_vec, m);
+                  s0_sig = vsx_i8x16_eq(s0_sig, vsx_i64x2_const(0, 0));
+                  // get MRP bit index, and MRP pattern
+                  s0_idx = vsx_i8x16_swizzle(ex_sum, m);
+                  s0_val = vsx_i8x16_swizzle(cwd_vec, s0_idx);
+                  // keep data from significant samples only
+                  s0_val = vsx_v128_andnot(s0_val, s0_sig);
+                  // move mrp bits to correct position, and employ
+                  s0_val = vsx_i32x4_shl(s0_val, p - 2);
+                  s0 = vsx_v128_xor(s0, s0_val);
+                  // store coefficients
+                  vsx_v128_store(dp, s0);
+                  // prepare for next row
+                  dp += stride;
+                  m = vsx_i32x4_add(m, vsx_i32x4_const(OJPH_REPEAT4(1)));
+                }
+              }
+              // consume data according to the number of bits set
+              rev_advance_mrp(&magref, (ui32)total_bits);
+            }
+          }
+        }
+      }
+
+      return true;
+    }
+  }
+}
diff --git a/src/core/openjph/ojph_arch.h b/src/core/openjph/ojph_arch.h
index 84c2834d..f8ba7992 100644
--- a/src/core/openjph/ojph_arch.h
+++ b/src/core/openjph/ojph_arch.h
@@ -40,6 +40,7 @@
 #ifndef OJPH_ARCH_H
 #define OJPH_ARCH_H
 
+#include <cstring>
 #include <cstdio>
 #include <cstdint>
 #include <cmath>
@@ -100,12 +101,20 @@
   #define OJPH_ARCH_UNKNOWN
 #endif
 
+// Only little-endian POWER (ppc64le) is supported for SIMD
+#if defined(OJPH_ARCH_PPC64) &&  \
+  (defined(__LITTLE_ENDIAN__) ||  \
+   (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
+  #define OJPH_ARCH_PPC64LE
+#endif
+
 namespace ojph {
   ////////////////////////////////////////////////////////////////////////////
   //                  disable SIMD for unknown architecture
   ////////////////////////////////////////////////////////////////////////////
 #if !defined(OJPH_ARCH_X86_64) && !defined(OJPH_ARCH_I386) &&  \
-    !defined(OJPH_ARCH_ARM) && !defined(OJPH_DISABLE_SIMD)
+    !defined(OJPH_ARCH_ARM) && !defined(OJPH_ARCH_PPC64LE) &&  \
+    !defined(OJPH_DISABLE_SIMD)
 #define OJPH_DISABLE_SIMD
 #endif // !OJPH_ARCH_UNKNOWN
 
@@ -164,6 +173,14 @@ namespace ojph {
     ARM_CPU_EXT_LEVEL_SVE2 = 3,
   };
 
+  // POWER9 (ISA 3.0) is the minimum supported SIMD level; older CPUs
+  // (POWER8 and earlier) use the generic code paths
+  enum : int {
+    PPC_CPU_EXT_LEVEL_GENERIC = 0,
+    PPC_CPU_EXT_LEVEL_ARCH_3_00 = 1, // ISA 3.0  (POWER9)
+    PPC_CPU_EXT_LEVEL_ARCH_3_1 = 2,  // ISA 3.1  (POWER10)
+  };
+
   /////////////////////////////////////////////////////////////////////////////
   static inline ui32 population_count(ui32 val)
   {
@@ -451,8 +468,11 @@ namespace ojph {
   // result, irrespective of the machine's endianness
   static inline ui32 load_le_ui32(const ui8 *p)
   {
-    if (is_machine_little_endian)
-      return *(const ui32 *)p;
+    if (is_machine_little_endian) {
+      ui32 val;
+      std::memcpy(&val, p, sizeof(val));
+      return val;
+    }
     else
       return (ui32)p[0] | ((ui32)p[1] << 8)
         | ((ui32)p[2] << 16) | ((ui32)p[3] << 24);
@@ -464,8 +484,11 @@ namespace ojph {
   // of the machine's endianness
   static inline ui32 load_le_ui16x2(const ui16 *p)
   {
-    if (is_machine_little_endian)
-      return *(const ui32 *)p;
+    if (is_machine_little_endian) {
+      ui32 val;
+      std::memcpy(&val, p, sizeof(val));
+      return val;
+    }
     else
       return (ui32)p[0] | ((ui32)p[1] << 16);
   }
diff --git a/src/core/openjph/ojph_simd_vsx.h b/src/core/openjph/ojph_simd_vsx.h
new file mode 100644
index 00000000..fae483d8
--- /dev/null
+++ b/src/core/openjph/ojph_simd_vsx.h
@@ -0,0 +1,361 @@
+//***************************************************************************/
+// This software is released under the 2-Clause BSD license, included
+// below.
+//
+// Copyright (c) 2026, Aous Naman
+// Copyright (c) 2026, Kakadu Software Pty Ltd, Australia
+// Copyright (c) 2026, The University of New South Wales, Australia
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************/
+// This file is part of the OpenJPH software implementation.
+// File: ojph_simd_vsx.h
+//
+// 128-bit SIMD helpers for POWER VSX, used by the ojph_*_vsx.cpp
+// kernels.  Lane numbering and operation semantics follow the same
+// conventions as the other 128-bit kernels in this codebase (lane 0
+// is the lowest memory address).  Supported targets are POWER9
+// (ISA 3.0) and newer, little-endian only (ppc64le).
+//***************************************************************************/
+
+#ifndef OJPH_SIMD_VSX_H
+#define OJPH_SIMD_VSX_H
+
+#if !defined(__powerpc64__) && !defined(__PPC64__)
+  #error "this header is for 64-bit POWER targets only"
+#endif
+#if !defined(__LITTLE_ENDIAN__) && \
+    !(defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+  #error "this header assumes a little-endian target (ppc64le)"
+#endif
+
+#include <altivec.h>
+#include <cstring>
+
+#include "ojph_defs.h"
+
+// altivec.h leaks these context-sensitive keywords as macros under GNU C;
+// they break standard headers and the codebase (e.g. std::vector)
+#undef vector
+#undef pixel
+#undef bool
+
+typedef __vector unsigned char v128_t;
+
+typedef __vector signed char        vsx_v_i8;
+typedef __vector unsigned char      vsx_v_u8;
+typedef __vector signed short       vsx_v_i16;
+typedef __vector unsigned short     vsx_v_u16;
+typedef __vector signed int         vsx_v_i32;
+typedef __vector unsigned int       vsx_v_u32;
+typedef __vector signed long long   vsx_v_i64;
+typedef __vector unsigned long long vsx_v_u64;
+typedef __vector float              vsx_v_f32;
+
+//---------------------------------------------------------------------------
+// load/store (alignment-agnostic; lxv/stxv handle unaligned addresses)
+//---------------------------------------------------------------------------
+static inline v128_t vsx_v128_load(const void *p)
+{ return vec_xl(0, (const unsigned char *)p); }
+
+static inline void vsx_v128_store(void *p, v128_t a)
+{ vec_xst(a, 0, (unsigned char *)p); }
+
+#define vsx_v128_store32_lane(p, a, i) \
+  do { vsx_v_i32 t_ = (vsx_v_i32)(a); int v_ = t_[(i)]; \
+       std::memcpy((p), &v_, 4); } while (0)
+
+//---------------------------------------------------------------------------
+// constants, splats, makes
+//---------------------------------------------------------------------------
+// functions, not macros, so that an argument that is itself a macro
+// expanding to an argument list (e.g. OJPH_REPEAT4) works
+static inline v128_t vsx_i8x16_const(
+  signed char c0, signed char c1, signed char c2, signed char c3,
+  signed char c4, signed char c5, signed char c6, signed char c7,
+  signed char c8, signed char c9, signed char c10, signed char c11,
+  signed char c12, signed char c13, signed char c14, signed char c15)
+{ vsx_v_i8 v = {c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15};
+  return (v128_t)v; }
+static inline v128_t vsx_i16x8_const(short c0, short c1, short c2,
+                                      short c3, short c4, short c5,
+                                      short c6, short c7)
+{ vsx_v_i16 v = {c0,c1,c2,c3,c4,c5,c6,c7}; return (v128_t)v; }
+static inline v128_t vsx_u16x8_const(unsigned short c0, unsigned short c1,
+                                      unsigned short c2, unsigned short c3,
+                                      unsigned short c4, unsigned short c5,
+                                      unsigned short c6, unsigned short c7)
+{ vsx_v_u16 v = {c0,c1,c2,c3,c4,c5,c6,c7}; return (v128_t)v; }
+static inline v128_t vsx_i32x4_const(int c0, int c1, int c2, int c3)
+{ vsx_v_i32 v = {c0,c1,c2,c3}; return (v128_t)v; }
+static inline v128_t vsx_u32x4_const(unsigned int c0, unsigned int c1,
+                                      unsigned int c2, unsigned int c3)
+{ vsx_v_u32 v = {c0,c1,c2,c3}; return (v128_t)v; }
+static inline v128_t vsx_i64x2_const(long long c0, long long c1)
+{ vsx_v_i64 v = {c0,c1}; return (v128_t)v; }
+static inline v128_t vsx_u64x2_const(unsigned long long c0,
+                                      unsigned long long c1)
+{ vsx_v_u64 v = {c0,c1}; return (v128_t)v; }
+
+static inline v128_t vsx_i8x16_splat(signed char x)
+{ ojph_unused(x); return (v128_t)vec_splats(x); }
+static inline v128_t vsx_i16x8_splat(short x)
+{ ojph_unused(x); return (v128_t)vec_splats(x); }
+static inline v128_t vsx_i32x4_splat(int x)
+{ ojph_unused(x); return (v128_t)vec_splats(x); }
+static inline v128_t vsx_u32x4_splat(unsigned int x)
+{ ojph_unused(x); return (v128_t)vec_splats(x); }
+static inline v128_t vsx_i64x2_splat(long long x)
+{ ojph_unused(x); return (v128_t)vec_splats((signed long long)x); }
+static inline v128_t vsx_f32x4_splat(float x)
+{ ojph_unused(x); return (v128_t)vec_splats(x); }
+
+static inline v128_t vsx_i32x4_make(int a, int b, int c, int d)
+{ return (v128_t)(vsx_v_i32){a, b, c, d}; }
+
+//---------------------------------------------------------------------------
+// lane extraction (subscript is little-endian lane order)
+//---------------------------------------------------------------------------
+#define vsx_u8x16_extract_lane(a, i)  (((vsx_v_u8)(a))[(i)])
+#define vsx_u16x8_extract_lane(a, i)  (((vsx_v_u16)(a))[(i)])
+#define vsx_i32x4_extract_lane(a, i)  (((vsx_v_i32)(a))[(i)])
+#define vsx_u32x4_extract_lane(a, i)  (((vsx_v_u32)(a))[(i)])
+#define vsx_i64x2_extract_lane(a, i)  (((vsx_v_i64)(a))[(i)])
+
+//---------------------------------------------------------------------------
+// bitwise
+//---------------------------------------------------------------------------
+static inline v128_t vsx_v128_and(v128_t a, v128_t b)
+{ return vec_and(a, b); }
+static inline v128_t vsx_v128_or(v128_t a, v128_t b)
+{ return vec_or(a, b); }
+static inline v128_t vsx_v128_xor(v128_t a, v128_t b)
+{ return vec_xor(a, b); }
+// a & ~b  (same operand order as vec_andc)
+static inline v128_t vsx_v128_andnot(v128_t a, v128_t b)
+{ return vec_andc(a, b); }
+
+//---------------------------------------------------------------------------
+// integer arithmetic
+//---------------------------------------------------------------------------
+static inline v128_t vsx_i8x16_add(v128_t a, v128_t b)
+{ return (v128_t)vec_add((vsx_v_i8)a, (vsx_v_i8)b); }
+static inline v128_t vsx_i16x8_add(v128_t a, v128_t b)
+{ return (v128_t)vec_add((vsx_v_i16)a, (vsx_v_i16)b); }
+static inline v128_t vsx_i32x4_add(v128_t a, v128_t b)
+{ return (v128_t)vec_add((vsx_v_i32)a, (vsx_v_i32)b); }
+static inline v128_t vsx_i64x2_add(v128_t a, v128_t b)
+{ return (v128_t)vec_add((vsx_v_i64)a, (vsx_v_i64)b); }
+
+static inline v128_t vsx_i16x8_sub(v128_t a, v128_t b)
+{ return (v128_t)vec_sub((vsx_v_i16)a, (vsx_v_i16)b); }
+static inline v128_t vsx_i32x4_sub(v128_t a, v128_t b)
+{ return (v128_t)vec_sub((vsx_v_i32)a, (vsx_v_i32)b); }
+static inline v128_t vsx_i64x2_sub(v128_t a, v128_t b)
+{ return (v128_t)vec_sub((vsx_v_i64)a, (vsx_v_i64)b); }
+
+// low half of products; vmladduhm / vmuluwm; i64x2 is lowered by the
+// compiler (mulld on ISA 3.0, vmulld on ISA 3.1)
+static inline v128_t vsx_i16x8_mul(v128_t a, v128_t b)
+{ return (v128_t)((vsx_v_i16)a * (vsx_v_i16)b); }
+static inline v128_t vsx_i32x4_mul(v128_t a, v128_t b)
+{ return (v128_t)((vsx_v_i32)a * (vsx_v_i32)b); }
+static inline v128_t vsx_i64x2_mul(v128_t a, v128_t b)
+{ return (v128_t)((vsx_v_i64)a * (vsx_v_i64)b); }
+
+static inline v128_t vsx_i8x16_abs(v128_t a)
+{ return (v128_t)vec_abs((vsx_v_i8)a); }
+static inline v128_t vsx_u8x16_min(v128_t a, v128_t b)
+{ return (v128_t)vec_min((vsx_v_u8)a, (vsx_v_u8)b); }
+static inline v128_t vsx_i16x8_max(v128_t a, v128_t b)
+{ return (v128_t)vec_max((vsx_v_i16)a, (vsx_v_i16)b); }
+
+//---------------------------------------------------------------------------
+// shifts (scalar count, modulo lane width)
+//---------------------------------------------------------------------------
+static inline v128_t vsx_i16x8_shl(v128_t a, int n)
+{ return (v128_t)vec_sl((vsx_v_i16)a, vec_splats((unsigned short)n)); }
+static inline v128_t vsx_i32x4_shl(v128_t a, int n)
+{ return (v128_t)vec_sl((vsx_v_i32)a, vec_splats((unsigned int)n)); }
+static inline v128_t vsx_i64x2_shl(v128_t a, int n)
+{ return (v128_t)vec_sl((vsx_v_i64)a,
+                        vec_splats((unsigned long long)n)); }
+
+static inline v128_t vsx_i32x4_shr(v128_t a, int n)   // arithmetic
+{ return (v128_t)vec_sra((vsx_v_i32)a, vec_splats((unsigned int)n)); }
+static inline v128_t vsx_i64x2_shr(v128_t a, int n)   // arithmetic
+{ return (v128_t)vec_sra((vsx_v_i64)a,
+                         vec_splats((unsigned long long)n)); }
+
+static inline v128_t vsx_u16x8_shr(v128_t a, int n)   // logical
+{ return (v128_t)vec_sr((vsx_v_u16)a, vec_splats((unsigned short)n)); }
+static inline v128_t vsx_u32x4_shr(v128_t a, int n)   // logical
+{ return (v128_t)vec_sr((vsx_v_u32)a, vec_splats((unsigned int)n)); }
+static inline v128_t vsx_u64x2_shr(v128_t a, int n)   // logical
+{ return (v128_t)vec_sr((vsx_v_u64)a,
+                        vec_splats((unsigned long long)n)); }
+
+//---------------------------------------------------------------------------
+// comparisons (true lanes -> all-ones, false lanes -> all-zeros)
+//---------------------------------------------------------------------------
+static inline v128_t vsx_i8x16_eq(v128_t a, v128_t b)
+{ return (v128_t)vec_cmpeq((vsx_v_i8)a, (vsx_v_i8)b); }
+static inline v128_t vsx_i16x8_eq(v128_t a, v128_t b)
+{ return (v128_t)vec_cmpeq((vsx_v_i16)a, (vsx_v_i16)b); }
+static inline v128_t vsx_i32x4_eq(v128_t a, v128_t b)
+{ return (v128_t)vec_cmpeq((vsx_v_i32)a, (vsx_v_i32)b); }
+
+static inline v128_t vsx_i8x16_gt(v128_t a, v128_t b)
+{ return (v128_t)vec_cmpgt((vsx_v_i8)a, (vsx_v_i8)b); }
+static inline v128_t vsx_i32x4_gt(v128_t a, v128_t b)
+{ return (v128_t)vec_cmpgt((vsx_v_i32)a, (vsx_v_i32)b); }
+static inline v128_t vsx_i32x4_lt(v128_t a, v128_t b)
+{ return (v128_t)vec_cmplt((vsx_v_i32)a, (vsx_v_i32)b); }
+static inline v128_t vsx_i64x2_lt(v128_t a, v128_t b)
+{ return (v128_t)vec_cmplt((vsx_v_i64)a, (vsx_v_i64)b); }
+
+static inline v128_t vsx_f32x4_ge(v128_t a, v128_t b)
+{ return (v128_t)vec_cmpge((vsx_v_f32)a, (vsx_v_f32)b); }
+static inline v128_t vsx_f32x4_lt(v128_t a, v128_t b)
+{ return (v128_t)vec_cmplt((vsx_v_f32)a, (vsx_v_f32)b); }
+
+//---------------------------------------------------------------------------
+// float arithmetic and conversions
+//---------------------------------------------------------------------------
+static inline v128_t vsx_f32x4_add(v128_t a, v128_t b)
+{ return (v128_t)vec_add((vsx_v_f32)a, (vsx_v_f32)b); }
+static inline v128_t vsx_f32x4_sub(v128_t a, v128_t b)
+{ return (v128_t)vec_sub((vsx_v_f32)a, (vsx_v_f32)b); }
+static inline v128_t vsx_f32x4_mul(v128_t a, v128_t b)
+{ return (v128_t)vec_mul((vsx_v_f32)a, (vsx_v_f32)b); }
+
+// xvcvspsxws: truncating, saturating (NaN gives 0x80000000; the
+// callers never pass NaN)
+static inline v128_t vsx_i32x4_trunc_sat_f32x4(v128_t a)
+{ return (v128_t)vec_cts((vsx_v_f32)a, 0); }
+static inline v128_t vsx_f32x4_convert_i32x4(v128_t a)
+{ return (v128_t)vec_ctf((vsx_v_i32)a, 0); }
+
+//---------------------------------------------------------------------------
+// widening
+//---------------------------------------------------------------------------
+static inline v128_t vsx_i64x2_extend_low_i32x4(v128_t a)
+{
+  // vsx_v_i32 v = (vsx_v_i32)a;
+  // return (v128_t)__builtin_convertvector(
+  //   __builtin_shufflevector(v, v, 0, 1), vsx_v_i64);
+
+  // Unpacks and sign-extends elements 0 and 1 on Little Endian
+  return (v128_t)vec_unpackl((vsx_v_i32)a);
+}
+static inline v128_t vsx_i64x2_extend_high_i32x4(v128_t a)
+{
+  // vsx_v_i32 v = (vsx_v_i32)a;
+  // return (v128_t)__builtin_convertvector(
+  //   __builtin_shufflevector(v, v, 2, 3), vsx_v_i64);
+
+  // Unpacks and sign-extends elements 2 and 3 on Little Endian
+  return (v128_t)vec_unpackh((vsx_v_i32)a);
+}
+
+//---------------------------------------------------------------------------
+// shuffles (immediate lane indices; 0..N-1 from a, N..2N-1 from b)
+//---------------------------------------------------------------------------
+// #define vsx_i8x16_shuffle(a, b, c0,c1,c2,c3,c4,c5,c6,c7,
+//                                  c8,c9,c10,c11,c12,c13,c14,c15)
+//   ((v128_t)__builtin_shufflevector((vsx_v_u8)(a), (vsx_v_u8)(b),
+//     c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15))
+// #define vsx_i16x8_shuffle(a, b, c0,c1,c2,c3,c4,c5,c6,c7)
+//   ((v128_t)__builtin_shufflevector((vsx_v_i16)(a), (vsx_v_i16)(b),
+//     c0,c1,c2,c3,c4,c5,c6,c7))
+// #define vsx_i32x4_shuffle(a, b, c0,c1,c2,c3)
+//   ((v128_t)__builtin_shufflevector((vsx_v_i32)(a), (vsx_v_i32)(b),
+//     c0,c1,c2,c3))
+// #define vsx_i64x2_shuffle(a, b, c0,c1)
+//   ((v128_t)__builtin_shufflevector((vsx_v_i64)(a), (vsx_v_i64)(b), c0,c1))
+
+// 8-bit Shuffle (Maps direct element indices to raw byte indices)
+#define vsx_i8x16_shuffle(a, b, c0,c1,c2,c3,c4,c5,c6,c7, \
+                                 c8,c9,c10,c11,c12,c13,c14,c15) \
+  ((v128_t)vec_perm((vsx_v_u8)(a), (vsx_v_u8)(b), (vsx_v_u8){ \
+    (c0), (c1), (c2), (c3), (c4), (c5), (c6), (c7), \
+    (c8), (c9), (c10),(c11),(c12),(c13),(c14),(c15) \
+  }))
+
+// 16-bit Shuffle (Multiplies element index by 2 to get byte offsets)
+#define vsx_i16x8_shuffle(a, b, c0,c1,c2,c3,c4,c5,c6,c7) \
+  ((v128_t)vec_perm((vsx_v_u8)(a), (vsx_v_u8)(b), (vsx_v_u8){ \
+    (c0)*2, (c0)*2+1,  (c1)*2, (c1)*2+1, \
+    (c2)*2, (c2)*2+1,  (c3)*2, (c3)*2+1, \
+    (c4)*2, (c4)*2+1,  (c5)*2, (c5)*2+1, \
+    (c6)*2, (c6)*2+1,  (c7)*2, (c7)*2+1  \
+  }))
+
+// 32-bit Shuffle (Multiplies element index by 4 to get byte offsets)
+#define vsx_i32x4_shuffle(a, b, c0,c1,c2,c3) \
+  ((v128_t)vec_perm((vsx_v_u8)(a), (vsx_v_u8)(b), (vsx_v_u8){ \
+    (c0)*4, (c0)*4+1, (c0)*4+2, (c0)*4+3, \
+    (c1)*4, (c1)*4+1, (c1)*4+2, (c1)*4+3, \
+    (c2)*4, (c2)*4+1, (c2)*4+2, (c2)*4+3, \
+    (c3)*4, (c3)*4+1, (c3)*4+2, (c3)*4+3  \
+  }))
+
+// 64-bit Shuffle (Multiplies element index by 8 to get byte offsets)
+#define vsx_i64x2_shuffle(a, b, c0,c1) \
+  ((v128_t)vec_perm((vsx_v_u8)(a), (vsx_v_u8)(b), (vsx_v_u8){ \
+    (c0)*8, (c0)*8+1, (c0)*8+2, (c0)*8+3,  \
+    (c0)*8+4, (c0)*8+5, (c0)*8+6, (c0)*8+7, \
+    (c1)*8, (c1)*8+1, (c1)*8+2, (c1)*8+3,  \
+    (c1)*8+4, (c1)*8+5, (c1)*8+6, (c1)*8+7  \
+  }))
+
+//---------------------------------------------------------------------------
+// swizzle: runtime byte-table lookup; lanes with index > 15 give 0
+//---------------------------------------------------------------------------
+static inline v128_t vsx_i8x16_swizzle(v128_t a, v128_t idx)
+{
+  v128_t r = vec_perm(a, a, idx);
+  v128_t oob = (v128_t)vec_cmpgt((vsx_v_u8)idx,
+                                 vec_splats((unsigned char)15));
+  return vec_andc(r, oob);
+}
+
+//---------------------------------------------------------------------------
+// bitmask: MSB of each byte lane -> bit of result, lane 0 -> bit 0
+// (vbpermq gathers the 16 selected bits into bits 48..63 of the
+// big-endian first doubleword, which is doubleword 1 on ppc64le)
+//---------------------------------------------------------------------------
+static inline int vsx_i8x16_bitmask(v128_t a)
+{
+#if defined(__POWER10_VECTOR__)
+  return (int)vec_extractm(a);   // ISA 3.1 native movemask
+#else
+  const vsx_v_u8 perm = { 120, 112, 104, 96, 88, 80, 72, 64,
+                            56,  48,  40, 32, 24, 16,  8,  0 };
+  vsx_v_u64 r = (vsx_v_u64)vec_bperm(a, perm);
+  return (int)r[1];
+#endif
+}
+
+#endif // OJPH_SIMD_VSX_H
diff --git a/src/core/others/ojph_arch.cpp b/src/core/others/ojph_arch.cpp
index 1f062041..0ab12b8a 100644
--- a/src/core/others/ojph_arch.cpp
+++ b/src/core/others/ojph_arch.cpp
@@ -229,7 +229,38 @@ namespace ojph {
 
     #endif
 
-  #else // architectures other than Intel/AMD and ARM
+  #elif defined(OJPH_ARCH_PPC64LE)
+
+    #if defined(OJPH_OS_LINUX)
+
+      #include <sys/auxv.h>
+      #include <asm/cputable.h>
+
+      bool init_cpu_ext_level(int& level) {
+        unsigned long hwcap = getauxval(AT_HWCAP);
+        unsigned long hwcap2 = getauxval(AT_HWCAP2);
+        level = PPC_CPU_EXT_LEVEL_GENERIC;
+        if ((hwcap & PPC_FEATURE_HAS_VSX) &&
+            (hwcap2 & PPC_FEATURE2_ARCH_3_00)) {
+          level = PPC_CPU_EXT_LEVEL_ARCH_3_00;
+        #ifdef PPC_FEATURE2_ARCH_3_1
+          if (hwcap2 & PPC_FEATURE2_ARCH_3_1)
+            level = PPC_CPU_EXT_LEVEL_ARCH_3_1;
+        #endif
+        }
+        return true;
+      }
+
+    #else // !OJPH_OS_LINUX
+
+      bool init_cpu_ext_level(int& level) {
+        level = PPC_CPU_EXT_LEVEL_GENERIC;
+        return true;
+      }
+
+    #endif
+
+  #else // architectures other than Intel/AMD, ARM, and PPC64LE
 
   ////////////////////////////////////////////////////////////////////////////
   bool init_cpu_ext_level(int& level) {
diff --git a/src/core/transform/ojph_colour.cpp b/src/core/transform/ojph_colour.cpp
index ef147ade..6e9d9709 100644
--- a/src/core/transform/ojph_colour.cpp
+++ b/src/core/transform/ojph_colour.cpp
@@ -177,6 +177,25 @@ namespace ojph {
 
     #elif defined(OJPH_ARCH_ARM)
 
+    #elif defined(OJPH_ARCH_PPC64LE)
+
+        if (get_cpu_ext_level() >= PPC_CPU_EXT_LEVEL_ARCH_3_00)
+        {
+          // 128-bit VSX kernels; see ojph_simd_vsx.h
+          rev_convert = vsx_rev_convert;
+          rev_convert_nlt_type3 = vsx_rev_convert_nlt_type3;
+          irv_convert_to_integer = vsx_irv_convert_to_integer;
+          irv_convert_to_float = vsx_irv_convert_to_float;
+          irv_convert_to_integer_nlt_type3 =
+            vsx_irv_convert_to_integer_nlt_type3;
+          irv_convert_to_float_nlt_type3 =
+            vsx_irv_convert_to_float_nlt_type3;
+          rct_forward = vsx_rct_forward;
+          rct_backward = vsx_rct_backward;
+          ict_forward = vsx_ict_forward;
+          ict_backward = vsx_ict_backward;
+        }
+
     #endif // !(defined(OJPH_ARCH_X86_64) || defined(OJPH_ARCH_I386))
 
   #endif // !OJPH_DISABLE_SIMD
diff --git a/src/core/transform/ojph_colour_local.h b/src/core/transform/ojph_colour_local.h
index a85bf8bd..abe7eea0 100644
--- a/src/core/transform/ojph_colour_local.h
+++ b/src/core/transform/ojph_colour_local.h
@@ -2,21 +2,21 @@
 // This software is released under the 2-Clause BSD license, included
 // below.
 //
-// Copyright (c) 2019, Aous Naman 
+// Copyright (c) 2019, Aous Naman
 // Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
 // Copyright (c) 2019, The University of New South Wales, Australia
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
-// 
+//
 // 1. Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
-// 
+//
 // 2. Redistributions in binary form must reproduce the above copyright
 // notice, this list of conditions and the following disclaimer in the
 // documentation and/or other materials provided with the distribution.
-// 
+//
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
@@ -66,14 +66,14 @@ namespace ojph {
 
     //////////////////////////////////////////////////////////////////////////
     void gen_rev_convert(
-      const line_buf *src_line, const ui32 src_line_offset, 
-      line_buf *dst_line, const ui32 dst_line_offset, 
+      const line_buf *src_line, const ui32 src_line_offset,
+      line_buf *dst_line, const ui32 dst_line_offset,
       si64 shift, ui32 width);
 
     //////////////////////////////////////////////////////////////////////////
     void gen_rev_convert_nlt_type3(
-      const line_buf *src_line, const ui32 src_line_offset, 
-      line_buf *dst_line, const ui32 dst_line_offset, 
+      const line_buf *src_line, const ui32 src_line_offset,
+      line_buf *dst_line, const ui32 dst_line_offset,
       si64 shift, ui32 width);
 
     //////////////////////////////////////////////////////////////////////////
@@ -158,14 +158,14 @@ namespace ojph {
 
     //////////////////////////////////////////////////////////////////////////
     void sse2_rev_convert(
-      const line_buf *src_line, const ui32 src_line_offset, 
-      line_buf *dst_line, const ui32 dst_line_offset, 
+      const line_buf *src_line, const ui32 src_line_offset,
+      line_buf *dst_line, const ui32 dst_line_offset,
       si64 shift, ui32 width);
 
     //////////////////////////////////////////////////////////////////////////
     void sse2_rev_convert_nlt_type3(
-      const line_buf *src_line, const ui32 src_line_offset, 
-      line_buf *dst_line, const ui32 dst_line_offset, 
+      const line_buf *src_line, const ui32 src_line_offset,
+      line_buf *dst_line, const ui32 dst_line_offset,
       si64 shift, ui32 width);
 
     //////////////////////////////////////////////////////////////////////////
@@ -214,14 +214,14 @@ namespace ojph {
 
     //////////////////////////////////////////////////////////////////////////
     void avx2_rev_convert(
-      const line_buf *src_line, const ui32 src_line_offset, 
-      line_buf *dst_line, const ui32 dst_line_offset, 
+      const line_buf *src_line, const ui32 src_line_offset,
+      line_buf *dst_line, const ui32 dst_line_offset,
       si64 shift, ui32 width);
 
     //////////////////////////////////////////////////////////////////////////
     void avx2_rev_convert_nlt_type3(
-      const line_buf *src_line, const ui32 src_line_offset, 
-      line_buf *dst_line, const ui32 dst_line_offset, 
+      const line_buf *src_line, const ui32 src_line_offset,
+      line_buf *dst_line, const ui32 dst_line_offset,
       si64 shift, ui32 width);
 
     //////////////////////////////////////////////////////////////////////////
@@ -257,7 +257,7 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
     //
     //
-    //                              WASM Functions 
+    //                              WASM Functions
     //
     //
     //////////////////////////////////////////////////////////////////////////
@@ -274,14 +274,14 @@ namespace ojph {
 
     //////////////////////////////////////////////////////////////////////////
     void wasm_rev_convert(
-      const line_buf *src_line, const ui32 src_line_offset, 
-      line_buf *dst_line, const ui32 dst_line_offset, 
+      const line_buf *src_line, const ui32 src_line_offset,
+      line_buf *dst_line, const ui32 dst_line_offset,
       si64 shift, ui32 width);
 
     //////////////////////////////////////////////////////////////////////////
     void wasm_rev_convert_nlt_type3(
-      const line_buf *src_line, const ui32 src_line_offset, 
-      line_buf *dst_line, const ui32 dst_line_offset, 
+      const line_buf *src_line, const ui32 src_line_offset,
+      line_buf *dst_line, const ui32 dst_line_offset,
       si64 shift, ui32 width);
 
     //////////////////////////////////////////////////////////////////////////
@@ -312,6 +312,64 @@ namespace ojph {
     void wasm_ict_backward(const float *y, const float *cb, const float *cr,
                            float *r, float *g, float *b, ui32 repeat);
 
+    //////////////////////////////////////////////////////////////////////////
+    //
+    //
+    //                               VSX Functions
+    //
+    //
+    //////////////////////////////////////////////////////////////////////////
+
+    //////////////////////////////////////////////////////////////////////////
+    void vsx_irv_convert_to_integer(
+      const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width);
+
+    //////////////////////////////////////////////////////////////////////////
+    void vsx_irv_convert_to_float(
+      const line_buf *src_line, ui32 src_line_offset,
+      line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width);
+
+    //////////////////////////////////////////////////////////////////////////
+    void vsx_rev_convert(
+      const line_buf *src_line, const ui32 src_line_offset,
+      line_buf *dst_line, const ui32 dst_line_offset,
+      si64 shift, ui32 width);
+
+    //////////////////////////////////////////////////////////////////////////
+    void vsx_rev_convert_nlt_type3(
+      const line_buf *src_line, const ui32 src_line_offset,
+      line_buf *dst_line, const ui32 dst_line_offset,
+      si64 shift, ui32 width);
+
+    //////////////////////////////////////////////////////////////////////////
+    void vsx_irv_convert_to_integer_nlt_type3(
+      const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width);
+
+    //////////////////////////////////////////////////////////////////////////
+    void vsx_irv_convert_to_float_nlt_type3(
+      const line_buf *src_line, ui32 src_line_offset,
+      line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width);
+
+    //////////////////////////////////////////////////////////////////////////
+    void vsx_rct_forward(
+      const line_buf *r, const line_buf *g, const line_buf *b,
+      line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat);
+
+    //////////////////////////////////////////////////////////////////////////
+    void vsx_rct_backward(
+      const line_buf *y, const line_buf *cb, const line_buf *cr,
+      line_buf *r, line_buf *g, line_buf *b, ui32 repeat);
+
+    //////////////////////////////////////////////////////////////////////////
+    void vsx_ict_forward(const float *r, const float *g, const float *b,
+                          float *y, float *cb, float *cr, ui32 repeat);
+
+    //////////////////////////////////////////////////////////////////////////
+    void vsx_ict_backward(const float *y, const float *cb, const float *cr,
+                           float *r, float *g, float *b, ui32 repeat);
+
   }
 }
 
diff --git a/src/core/transform/ojph_colour_vsx.cpp b/src/core/transform/ojph_colour_vsx.cpp
new file mode 100644
index 00000000..ef358245
--- /dev/null
+++ b/src/core/transform/ojph_colour_vsx.cpp
@@ -0,0 +1,668 @@
+//***************************************************************************/
+// This software is released under the 2-Clause BSD license, included
+// below.
+//
+// Copyright (c) 2021, Aous Naman
+// Copyright (c) 2021, Kakadu Software Pty Ltd, Australia
+// Copyright (c) 2021, The University of New South Wales, Australia
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************/
+// This file is part of the OpenJPH software implementation.
+// File: ojph_colour_vsx.cpp
+// Author: Aous Naman
+// Date: 9 February 2021
+//***************************************************************************/
+
+#include <climits>
+#include <cmath>
+#include "ojph_simd_vsx.h"
+
+#include "ojph_defs.h"
+#include "ojph_mem.h"
+#include "ojph_colour.h"
+#include "ojph_colour_local.h"
+
+namespace ojph {
+  namespace local {
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    v128_t ojph_convert_float_to_i32(v128_t a)
+    { // We implement ojph_round, which is
+      // val + (val >= 0.0f ? 0.5f : -0.5f), where val is float; this is
+      // round to nearest with ties away from zero, which is exactly what
+      // xvrspi does.  The instruction is used via inline asm because
+      // GCC's vec_round rounds ties to even.
+      vsx_v_f32 w;
+      __asm__("xvrspi %x0,%x1" : "=wa"(w) : "wa"((vsx_v_f32)a));
+      return (v128_t)vec_cts(w, 0);     // saturating convert to int32
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void vsx_rev_convert(const line_buf *src_line,
+                          const ui32 src_line_offset,
+                          line_buf *dst_line,
+                          const ui32 dst_line_offset,
+                          si64 shift, ui32 width)
+    {
+      if (src_line->flags & line_buf::LFT_32BIT)
+      {
+        if (dst_line->flags & line_buf::LFT_32BIT)
+        {
+          const si32 *sp = src_line->i32 + src_line_offset;
+          si32 *dp = dst_line->i32 + dst_line_offset;
+          v128_t sh = vsx_i32x4_splat((si32)shift);
+          for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
+          {
+            v128_t s = vsx_v128_load(sp);
+            s = vsx_i32x4_add(s, sh);
+            vsx_v128_store(dp, s);
+          }
+        }
+        else
+        {
+          const si32 *sp = src_line->i32 + src_line_offset;
+          si64 *dp = dst_line->i64 + dst_line_offset;
+          v128_t sh = vsx_i64x2_splat(shift);
+          for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
+          {
+            v128_t s, t;
+            s = vsx_v128_load(sp);
+
+            t = vsx_i64x2_extend_low_i32x4(s);
+            t = vsx_i64x2_add(t, sh);
+            vsx_v128_store(dp, t);
+
+            t = vsx_i64x2_extend_high_i32x4(s);
+            t = vsx_i64x2_add(t, sh);
+            vsx_v128_store(dp + 2, t);
+          }
+        }
+      }
+      else
+      {
+        assert(src_line->flags | line_buf::LFT_64BIT);
+        assert(dst_line->flags | line_buf::LFT_32BIT);
+        const si64 *sp = src_line->i64 + src_line_offset;
+        si32 *dp = dst_line->i32 + dst_line_offset;
+        v128_t sh = vsx_i64x2_splat(shift);
+        for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
+        {
+          v128_t s0, s1;
+          s0 = vsx_v128_load(sp);
+          s0 = vsx_i64x2_add(s0, sh);
+          s1 = vsx_v128_load(sp + 2);
+          s1 = vsx_i64x2_add(s1, sh);
+          s0 = vsx_i32x4_shuffle(s0, s1, 0, 2, 4 + 0, 4 + 2);
+          vsx_v128_store(dp, s0);
+        }
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void vsx_rev_convert_nlt_type3(const line_buf *src_line,
+                                    const ui32 src_line_offset,
+                                    line_buf *dst_line,
+                                    const ui32 dst_line_offset,
+                                    si64 shift, ui32 width)
+    {
+      if (src_line->flags & line_buf::LFT_32BIT)
+      {
+        if (dst_line->flags & line_buf::LFT_32BIT)
+        {
+          const si32 *sp = src_line->i32 + src_line_offset;
+          si32 *dp = dst_line->i32 + dst_line_offset;
+          v128_t sh = vsx_i32x4_splat((si32)(-shift));
+          v128_t zero = vsx_i32x4_splat(0);
+          for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
+          {
+            v128_t s = vsx_v128_load(sp);
+            v128_t c = vsx_i32x4_lt(s, zero);     // 0xFFFFFFFF for -ve value
+            v128_t v_m_sh = vsx_i32x4_sub(sh, s); // - shift - value
+            v_m_sh = vsx_v128_and(c, v_m_sh);     // keep only - shift - value
+            s = vsx_v128_andnot(s, c);            // keep only +ve or 0
+            s = vsx_v128_or(s, v_m_sh);           // combine
+            vsx_v128_store(dp, s);
+          }
+        }
+        else
+        {
+          const si32 *sp = src_line->i32 + src_line_offset;
+          si64 *dp = dst_line->i64 + dst_line_offset;
+          v128_t sh = vsx_i64x2_splat(-shift);
+          v128_t zero = vsx_i32x4_splat(0);
+          for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
+          {
+            v128_t s, u, c, v_m_sh;
+            s = vsx_v128_load(sp);
+
+            u = vsx_i64x2_extend_low_i32x4(s);
+            c = vsx_i64x2_lt(u, zero);        // 64b -1 for -ve value
+            v_m_sh = vsx_i64x2_sub(sh, u);    // - shift - value
+            v_m_sh = vsx_v128_and(c, v_m_sh); // keep only - shift - value
+            u = vsx_v128_andnot(u, c);        // keep only +ve or 0
+            u = vsx_v128_or(u, v_m_sh);       // combine
+
+            vsx_v128_store(dp, u);
+
+            u = vsx_i64x2_extend_high_i32x4(s);
+            c = vsx_i64x2_lt(u, zero);        // 64b -1 for -ve value
+            v_m_sh = vsx_i64x2_sub(sh, u);    // - shift - value
+            v_m_sh = vsx_v128_and(c, v_m_sh); // keep only - shift - value
+            u = vsx_v128_andnot(u, c);        // keep only +ve or 0
+            u = vsx_v128_or(u, v_m_sh);       // combine
+
+            vsx_v128_store(dp + 2, u);
+          }
+        }
+      }
+      else
+      {
+        assert(src_line->flags | line_buf::LFT_64BIT);
+        assert(dst_line->flags | line_buf::LFT_32BIT);
+        const si64 *sp = src_line->i64 + src_line_offset;
+        si32 *dp = dst_line->i32 + dst_line_offset;
+        v128_t sh = vsx_i64x2_splat(-shift);
+        v128_t zero = vsx_i32x4_splat(0);
+        for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4)
+        {
+          // s for source, t for target, p for positive, n for negative,
+          // m for mask, and tm for temp
+          v128_t s, t0, t1, p, n, m, tm;
+          s = vsx_v128_load(sp);
+          m = vsx_i64x2_lt(s, zero);   // 64b -1 for -ve value
+          tm = vsx_i64x2_sub(sh, s);   // - shift - value
+          n = vsx_v128_and(m, tm);     // -ve
+          p = vsx_v128_andnot(s, m);   // +ve
+          t0 = vsx_v128_or(n, p);
+
+          s = vsx_v128_load(sp + 2);
+          m = vsx_i64x2_lt(s, zero);   // 64b -1 for -ve value
+          tm = vsx_i64x2_sub(sh, s);   // - shift - value
+          n = vsx_v128_and(m, tm);     // -ve
+          p = vsx_v128_andnot(s, m);   // +ve
+          t1 = vsx_v128_or(n, p);
+
+          t0 = vsx_i32x4_shuffle(t0, t1, 0, 2, 4 + 0, 4 + 2);
+          vsx_v128_store(dp, t0);
+        }
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void vsx_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul,
+                                        ui32 width)
+    {
+      v128_t shift = vsx_f32x4_splat(0.5f);
+      v128_t m = vsx_f32x4_splat(mul);
+      for (ui32 i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
+      {
+        v128_t t = vsx_v128_load(sp);
+        v128_t s = vsx_f32x4_convert_i32x4(t);
+        s = vsx_f32x4_mul(s, m);
+        s = vsx_f32x4_sub(s, shift);
+        vsx_v128_store(dp, s);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void vsx_cnvrt_si32_to_float(const si32 *sp, float *dp, float mul,
+                                  ui32 width)
+    {
+      v128_t m = vsx_f32x4_splat(mul);
+      for (ui32 i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
+      {
+        v128_t t = vsx_v128_load(sp);
+        v128_t s = vsx_f32x4_convert_i32x4(t);
+        s = vsx_f32x4_mul(s, m);
+        vsx_v128_store(dp, s);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void vsx_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul,
+                                        ui32 width)
+    {
+      const v128_t half = vsx_f32x4_splat(0.5f);
+      v128_t m = vsx_f32x4_splat(mul);
+      for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
+      {
+        v128_t t = vsx_v128_load(sp);
+        v128_t s = vsx_f32x4_add(t, half);
+        s = vsx_f32x4_mul(s, m);
+        s = vsx_f32x4_add(s, half); // + 0.5 and followed by floor next
+        vsx_v128_store(dp, ojph_convert_float_to_i32(s));
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void vsx_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul,
+                                  ui32 width)
+    {
+      const v128_t half = vsx_f32x4_splat(0.5f);
+      v128_t m = vsx_f32x4_splat(mul);
+      for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
+      {
+        v128_t t = vsx_v128_load(sp);
+        v128_t s = vsx_f32x4_mul(t, m);
+        s = vsx_f32x4_add(s, half); // + 0.5 and followed by floor next
+        vsx_v128_store(dp, ojph_convert_float_to_i32(s));
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    v128_t ojph_vsx_i32x4_max_ge(v128_t a, v128_t b, v128_t x, v128_t y)
+    {
+      v128_t c = vsx_f32x4_ge(x, y);    // 0xFFFFFFFF for x >= y
+      return (v128_t)vec_sel((vsx_v_u32)b, (vsx_v_u32)a, (vsx_v_u32)c);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    v128_t ojph_vsx_i32x4_min_lt(v128_t a, v128_t b, v128_t x, v128_t y)
+    {
+      v128_t c = vsx_f32x4_lt(x, y);    // 0xFFFFFFFF for x < y
+      return (v128_t)vec_sel((vsx_v_u32)b, (vsx_v_u32)a, (vsx_v_u32)c);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    template <bool NLT_TYPE3>
+    static inline
+    void local_vsx_irv_convert_to_integer(const line_buf *src_line,
+      line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    {
+      assert((src_line->flags & line_buf::LFT_32BIT) &&
+             (src_line->flags & line_buf::LFT_INTEGER) == 0 &&
+             (dst_line->flags & line_buf::LFT_32BIT) &&
+             (dst_line->flags & line_buf::LFT_INTEGER));
+
+      assert(bit_depth <= 32);
+      const float* sp = src_line->f32;
+      si32* dp = dst_line->i32 + dst_line_offset;
+      // There is the possibility that converting to integer will
+      // exceed the dynamic range of 32bit integer; therefore, care must be
+      // exercised.
+      // We look if the floating point number is outside the half-closed
+      // interval [-0.5f, 0.5f). If so, we limit the resulting integer
+      // to the maximum/minimum that number supports.
+      si32 neg_limit = (si32)INT_MIN >> (32 - bit_depth);
+      v128_t mul = vsx_f32x4_splat((float)(1ull << bit_depth));
+      v128_t fl_up_lim = vsx_f32x4_splat(-(float)neg_limit); // val < upper
+      v128_t fl_low_lim = vsx_f32x4_splat((float)neg_limit); // val >= lower
+      v128_t s32_up_lim = vsx_i32x4_splat(INT_MAX >> (32 - bit_depth));
+      v128_t s32_low_lim = vsx_i32x4_splat(INT_MIN >> (32 - bit_depth));
+
+      if (is_signed)
+      {
+        const v128_t zero = vsx_f32x4_splat(0.0f);
+        v128_t bias = vsx_i32x4_splat(-(si32)((1ULL << (bit_depth - 1)) + 1));
+        for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
+          v128_t t = vsx_v128_load(sp);
+          t = vsx_f32x4_mul(t, mul);
+          v128_t u = ojph_convert_float_to_i32(t);
+          u = ojph_vsx_i32x4_max_ge(u, s32_low_lim, t, fl_low_lim);
+          u = ojph_vsx_i32x4_min_lt(u, s32_up_lim, t, fl_up_lim);
+          if (NLT_TYPE3)
+          {
+            v128_t c = vsx_i32x4_gt(zero, u);    // 0xFFFFFFFF for -ve value
+            v128_t neg = vsx_i32x4_sub(bias, u); // -bias -value
+            neg = vsx_v128_and(c, neg);          // keep only - bias - value
+            u = vsx_v128_andnot(u, c);           // keep only +ve or 0
+            u = vsx_v128_or(neg, u);             // combine
+          }
+          vsx_v128_store(dp, u);
+        }
+      }
+      else
+      {
+        v128_t ihalf = vsx_i32x4_splat((si32)(1ULL << (bit_depth - 1)));
+        for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
+          v128_t t = vsx_v128_load(sp);
+          t = vsx_f32x4_mul(t, mul);
+          v128_t u = ojph_convert_float_to_i32(t);
+          u = ojph_vsx_i32x4_max_ge(u, s32_low_lim, t, fl_low_lim);
+          u = ojph_vsx_i32x4_min_lt(u, s32_up_lim, t, fl_up_lim);
+          u = vsx_i32x4_add(u, ihalf);
+          vsx_v128_store(dp, u);
+        }
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void vsx_irv_convert_to_integer(const line_buf *src_line,
+      line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    {
+      local_vsx_irv_convert_to_integer<false>(src_line, dst_line, 
+        dst_line_offset, bit_depth, is_signed, width);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void vsx_irv_convert_to_integer_nlt_type3(const line_buf *src_line,
+      line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    {
+      local_vsx_irv_convert_to_integer<true>(src_line, dst_line, 
+        dst_line_offset, bit_depth, is_signed, width);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    template <bool NLT_TYPE3>
+    static inline
+    void local_vsx_irv_convert_to_float(const line_buf *src_line,
+      ui32 src_line_offset, line_buf *dst_line,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    {
+      assert((src_line->flags & line_buf::LFT_32BIT) &&
+             (src_line->flags & line_buf::LFT_INTEGER) &&
+             (dst_line->flags & line_buf::LFT_32BIT) &&
+             (dst_line->flags & line_buf::LFT_INTEGER) == 0);
+
+      assert(bit_depth <= 32);
+      v128_t mul = vsx_f32x4_splat((float)(1.0 / (double)(1ULL << bit_depth)));
+
+      const si32* sp = src_line->i32 + src_line_offset;
+      float* dp = dst_line->f32;
+      if (is_signed)
+      {
+        v128_t zero = vsx_i32x4_splat(0);
+        v128_t bias = vsx_i32x4_splat(-(si32)((1ULL << (bit_depth - 1)) + 1));
+        for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
+          v128_t t = vsx_v128_load(sp);
+          if (NLT_TYPE3)
+          {
+            v128_t c = vsx_i32x4_lt(t, zero);    // 0xFFFFFFFF for -ve value
+            v128_t neg = vsx_i32x4_sub(bias, t); // - bias - value
+            neg = vsx_v128_and(c, neg);          // keep only - bias - value
+            c = vsx_v128_andnot(t, c);           // keep only +ve or 0
+            t = vsx_v128_or(neg, c);             // combine
+          }
+          v128_t v = vsx_f32x4_convert_i32x4(t);
+          v = vsx_f32x4_mul(v, mul);
+          vsx_v128_store(dp, v);
+        }
+      }
+      else
+      {
+        v128_t half = vsx_i32x4_splat((si32)(1ULL << (bit_depth - 1)));
+        for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
+          v128_t t = vsx_v128_load(sp);
+          t = vsx_i32x4_sub(t, half);
+          v128_t v = vsx_f32x4_convert_i32x4(t);
+          v = vsx_f32x4_mul(v, mul);
+          vsx_v128_store(dp, v);
+        }
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void vsx_irv_convert_to_float(const line_buf *src_line,
+      ui32 src_line_offset, line_buf *dst_line,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    {
+      local_vsx_irv_convert_to_float<false>(src_line, src_line_offset,
+        dst_line, bit_depth, is_signed, width);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void vsx_irv_convert_to_float_nlt_type3(const line_buf *src_line,
+      ui32 src_line_offset, line_buf *dst_line,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    {
+      local_vsx_irv_convert_to_float<true>(src_line, src_line_offset,
+        dst_line, bit_depth, is_signed, width);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void vsx_rct_forward(const line_buf *r,
+                          const line_buf *g,
+                          const line_buf *b,
+                          line_buf *y, line_buf *cb, line_buf *cr,
+                          ui32 repeat)
+    {
+      assert((y->flags  & line_buf::LFT_INTEGER) &&
+             (cb->flags & line_buf::LFT_INTEGER) &&
+             (cr->flags & line_buf::LFT_INTEGER) &&
+             (r->flags  & line_buf::LFT_INTEGER) &&
+             (g->flags  & line_buf::LFT_INTEGER) &&
+             (b->flags  & line_buf::LFT_INTEGER));
+
+      if  (y->flags & line_buf::LFT_32BIT)
+      {
+        assert((y->flags  & line_buf::LFT_32BIT) &&
+               (cb->flags & line_buf::LFT_32BIT) &&
+               (cr->flags & line_buf::LFT_32BIT) &&
+               (r->flags  & line_buf::LFT_32BIT) &&
+               (g->flags  & line_buf::LFT_32BIT) &&
+               (b->flags  & line_buf::LFT_32BIT));
+        const si32 *rp = r->i32, * gp = g->i32, * bp = b->i32;
+        si32 *yp = y->i32, * cbp = cb->i32, * crp = cr->i32;
+
+        for (int i = (repeat + 3) >> 2; i > 0; --i)
+        {
+          v128_t mr = vsx_v128_load(rp);
+          v128_t mg = vsx_v128_load(gp);
+          v128_t mb = vsx_v128_load(bp);
+          v128_t t = vsx_i32x4_add(mr, mb);
+          t = vsx_i32x4_add(t, vsx_i32x4_shl(mg, 1));
+          vsx_v128_store(yp, vsx_i32x4_shr(t, 2));
+          t = vsx_i32x4_sub(mb, mg);
+          vsx_v128_store(cbp, t);
+          t = vsx_i32x4_sub(mr, mg);
+          vsx_v128_store(crp, t);
+
+            rp += 4; gp += 4; bp += 4;
+            yp += 4; cbp += 4; crp += 4;
+        }
+      }
+      else
+      {
+        assert((y->flags  & line_buf::LFT_64BIT) &&
+               (cb->flags & line_buf::LFT_64BIT) &&
+               (cr->flags & line_buf::LFT_64BIT) &&
+               (r->flags  & line_buf::LFT_32BIT) &&
+               (g->flags  & line_buf::LFT_32BIT) &&
+               (b->flags  & line_buf::LFT_32BIT));
+        const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
+        si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64;
+        for (int i = (repeat + 3) >> 2; i > 0; --i)
+        {
+          v128_t mr32 = vsx_v128_load(rp);
+          v128_t mg32 = vsx_v128_load(gp);
+          v128_t mb32 = vsx_v128_load(bp);
+          v128_t mr, mg, mb, t;
+          mr = vsx_i64x2_extend_low_i32x4(mr32);
+          mg = vsx_i64x2_extend_low_i32x4(mg32);
+          mb = vsx_i64x2_extend_low_i32x4(mb32);
+
+          t = vsx_i64x2_add(mr, mb);
+          t = vsx_i64x2_add(t, vsx_i64x2_shl(mg, 1));
+          vsx_v128_store(yp, vsx_i64x2_shr(t, 2));
+          t = vsx_i64x2_sub(mb, mg);
+          vsx_v128_store(cbp, t);
+          t = vsx_i64x2_sub(mr, mg);
+          vsx_v128_store(crp, t);
+
+          yp += 2; cbp += 2; crp += 2;
+
+          mr = vsx_i64x2_extend_high_i32x4(mr32);
+          mg = vsx_i64x2_extend_high_i32x4(mg32);
+          mb = vsx_i64x2_extend_high_i32x4(mb32);
+
+          t = vsx_i64x2_add(mr, mb);
+          t = vsx_i64x2_add(t, vsx_i64x2_shl(mg, 1));
+          vsx_v128_store(yp, vsx_i64x2_shr(t, 2));
+          t = vsx_i64x2_sub(mb, mg);
+          vsx_v128_store(cbp, t);
+          t = vsx_i64x2_sub(mr, mg);
+          vsx_v128_store(crp, t);
+
+          rp += 4; gp += 4; bp += 4;
+          yp += 2; cbp += 2; crp += 2;
+        }
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void vsx_rct_backward(const line_buf *y,
+                           const line_buf *cb,
+                           const line_buf *cr,
+                           line_buf *r, line_buf *g, line_buf *b,
+                           ui32 repeat)
+    {
+      assert((y->flags  & line_buf::LFT_INTEGER) &&
+             (cb->flags & line_buf::LFT_INTEGER) &&
+             (cr->flags & line_buf::LFT_INTEGER) &&
+             (r->flags  & line_buf::LFT_INTEGER) &&
+             (g->flags  & line_buf::LFT_INTEGER) &&
+             (b->flags  & line_buf::LFT_INTEGER));
+
+      if (y->flags & line_buf::LFT_32BIT)
+      {
+        assert((y->flags  & line_buf::LFT_32BIT) &&
+               (cb->flags & line_buf::LFT_32BIT) &&
+               (cr->flags & line_buf::LFT_32BIT) &&
+               (r->flags  & line_buf::LFT_32BIT) &&
+               (g->flags  & line_buf::LFT_32BIT) &&
+               (b->flags  & line_buf::LFT_32BIT));
+        const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32;
+        si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
+        for (int i = (repeat + 3) >> 2; i > 0; --i)
+        {
+          v128_t my  = vsx_v128_load(yp);
+          v128_t mcb = vsx_v128_load(cbp);
+          v128_t mcr = vsx_v128_load(crp);
+
+          v128_t t = vsx_i32x4_add(mcb, mcr);
+          t = vsx_i32x4_sub(my, vsx_i32x4_shr(t, 2));
+          vsx_v128_store(gp, t);
+          v128_t u = vsx_i32x4_add(mcb, t);
+          vsx_v128_store(bp, u);
+          u = vsx_i32x4_add(mcr, t);
+          vsx_v128_store(rp, u);
+
+          yp += 4; cbp += 4; crp += 4;
+          rp += 4; gp += 4; bp += 4;
+        }
+      }
+      else
+      {
+        assert((y->flags  & line_buf::LFT_64BIT) &&
+               (cb->flags & line_buf::LFT_64BIT) &&
+               (cr->flags & line_buf::LFT_64BIT) &&
+               (r->flags  & line_buf::LFT_32BIT) &&
+               (g->flags  & line_buf::LFT_32BIT) &&
+               (b->flags  & line_buf::LFT_32BIT));
+        const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64;
+        si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
+        for (int i = (repeat + 3) >> 2; i > 0; --i)
+        {
+          v128_t my, mcb, mcr, tr0, tg0, tb0, tr1, tg1, tb1;
+          my  = vsx_v128_load(yp);
+          mcb = vsx_v128_load(cbp);
+          mcr = vsx_v128_load(crp);
+
+          tg0 = vsx_i64x2_add(mcb, mcr);
+          tg0 = vsx_i64x2_sub(my, vsx_i64x2_shr(tg0, 2));
+          tb0 = vsx_i64x2_add(mcb, tg0);
+          tr0 = vsx_i64x2_add(mcr, tg0);
+
+          yp += 2; cbp += 2; crp += 2;
+
+          my  = vsx_v128_load(yp);
+          mcb = vsx_v128_load(cbp);
+          mcr = vsx_v128_load(crp);
+
+          tg1 = vsx_i64x2_add(mcb, mcr);
+          tg1 = vsx_i64x2_sub(my, vsx_i64x2_shr(tg1, 2));
+          tb1 = vsx_i64x2_add(mcb, tg1);
+          tr1 = vsx_i64x2_add(mcr, tg1);
+
+          tr0 = vsx_i32x4_shuffle(tr0, tr1, 0, 2, 4 + 0, 4 + 2);
+          tg0 = vsx_i32x4_shuffle(tg0, tg1, 0, 2, 4 + 0, 4 + 2);
+          tb0 = vsx_i32x4_shuffle(tb0, tb1, 0, 2, 4 + 0, 4 + 2);
+
+          vsx_v128_store(rp, tr0);
+          vsx_v128_store(gp, tg0);
+          vsx_v128_store(bp, tb0);
+
+          yp += 2; cbp += 2; crp += 2;
+          rp += 4; gp += 4; bp += 4;
+        }
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void vsx_ict_forward(const float *r, const float *g, const float *b,
+                          float *y, float *cb, float *cr, ui32 repeat)
+    {
+      v128_t alpha_rf = vsx_f32x4_splat(CT_CNST::ALPHA_RF);
+      v128_t alpha_gf = vsx_f32x4_splat(CT_CNST::ALPHA_GF);
+      v128_t alpha_bf = vsx_f32x4_splat(CT_CNST::ALPHA_BF);
+      v128_t beta_cbf = vsx_f32x4_splat(CT_CNST::BETA_CbF);
+      v128_t beta_crf = vsx_f32x4_splat(CT_CNST::BETA_CrF);
+      for (ui32 i = (repeat + 3) >> 2; i > 0; --i)
+      {
+        v128_t mr = vsx_v128_load(r);
+        v128_t mb = vsx_v128_load(b);
+        v128_t my = vsx_f32x4_mul(alpha_rf, mr);
+        my = vsx_f32x4_add(my, vsx_f32x4_mul(alpha_gf, vsx_v128_load(g)));
+        my = vsx_f32x4_add(my, vsx_f32x4_mul(alpha_bf, mb));
+        vsx_v128_store(y, my);
+        vsx_v128_store(cb, vsx_f32x4_mul(beta_cbf, vsx_f32x4_sub(mb, my)));
+        vsx_v128_store(cr, vsx_f32x4_mul(beta_crf, vsx_f32x4_sub(mr, my)));
+
+        r += 4; g += 4; b += 4;
+        y += 4; cb += 4; cr += 4;
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void vsx_ict_backward(const float *y, const float *cb, const float *cr,
+                           float *r, float *g, float *b, ui32 repeat)
+    {
+      v128_t gamma_cr2g = vsx_f32x4_splat(CT_CNST::GAMMA_CR2G);
+      v128_t gamma_cb2g = vsx_f32x4_splat(CT_CNST::GAMMA_CB2G);
+      v128_t gamma_cr2r = vsx_f32x4_splat(CT_CNST::GAMMA_CR2R);
+      v128_t gamma_cb2b = vsx_f32x4_splat(CT_CNST::GAMMA_CB2B);
+      for (ui32 i = (repeat + 3) >> 2; i > 0; --i)
+      {
+        v128_t my = vsx_v128_load(y);
+        v128_t mcr = vsx_v128_load(cr);
+        v128_t mcb = vsx_v128_load(cb);
+        v128_t mg = vsx_f32x4_sub(my, vsx_f32x4_mul(gamma_cr2g, mcr));
+        vsx_v128_store(g, vsx_f32x4_sub(mg, vsx_f32x4_mul(gamma_cb2g, mcb)));
+        vsx_v128_store(r, vsx_f32x4_add(my, vsx_f32x4_mul(gamma_cr2r, mcr)));
+        vsx_v128_store(b, vsx_f32x4_add(my, vsx_f32x4_mul(gamma_cb2b, mcb)));
+
+        y += 4; cb += 4; cr += 4;
+        r += 4; g += 4; b += 4;
+      }
+    }
+
+  }
+}
diff --git a/src/core/transform/ojph_transform.cpp b/src/core/transform/ojph_transform.cpp
index f67ea1b6..e587b13e 100644
--- a/src/core/transform/ojph_transform.cpp
+++ b/src/core/transform/ojph_transform.cpp
@@ -168,6 +168,21 @@ namespace ojph {
     
     #elif defined(OJPH_ARCH_ARM)
 
+    #elif defined(OJPH_ARCH_PPC64LE)
+
+        if (get_cpu_ext_level() >= PPC_CPU_EXT_LEVEL_ARCH_3_00)
+        {
+          // 128-bit VSX kernels; see ojph_simd_vsx.h
+          rev_vert_step             = vsx_rev_vert_step;
+          rev_horz_ana              = vsx_rev_horz_ana;
+          rev_horz_syn              = vsx_rev_horz_syn;
+
+          irv_vert_step             = vsx_irv_vert_step;
+          irv_vert_times_K          = vsx_irv_vert_times_K;
+          irv_horz_ana              = vsx_irv_horz_ana;
+          irv_horz_syn              = vsx_irv_horz_syn;
+        }
+
     #endif // !(defined(OJPH_ARCH_X86_64) || defined(OJPH_ARCH_I386))
 
   #endif // !OJPH_DISABLE_SIMD
diff --git a/src/core/transform/ojph_transform_local.h b/src/core/transform/ojph_transform_local.h
index acf9ee6d..4bc0c143 100644
--- a/src/core/transform/ojph_transform_local.h
+++ b/src/core/transform/ojph_transform_local.h
@@ -2,21 +2,21 @@
 // This software is released under the 2-Clause BSD license, included
 // below.
 //
-// Copyright (c) 2019, Aous Naman 
+// Copyright (c) 2019, Aous Naman
 // Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
 // Copyright (c) 2019, The University of New South Wales, Australia
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
-// 
+//
 // 1. Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
-// 
+//
 // 2. Redistributions in binary form must reproduce the above copyright
 // notice, this list of conditions and the following disclaimer in the
 // documentation and/or other materials provided with the distribution.
-// 
+//
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
@@ -63,21 +63,21 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
 
     /////////////////////////////////////////////////////////////////////////
-    void gen_irv_vert_step(const lifting_step* s, const line_buf* sig, 
-                           const line_buf* other, const line_buf* aug, 
+    void gen_irv_vert_step(const lifting_step* s, const line_buf* sig,
+                           const line_buf* other, const line_buf* aug,
                            ui32 repeat, bool synthesis);
 
     /////////////////////////////////////////////////////////////////////////
     void gen_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat);
 
     /////////////////////////////////////////////////////////////////////////
-    void gen_irv_horz_ana(const param_atk* atk, const line_buf* ldst, 
-                          const line_buf* hdst, const line_buf* src, 
+    void gen_irv_horz_ana(const param_atk* atk, const line_buf* ldst,
+                          const line_buf* hdst, const line_buf* src,
                           ui32 width, bool even);
 
     /////////////////////////////////////////////////////////////////////////
-    void gen_irv_horz_syn(const param_atk *atk, const line_buf* dst, 
-                          const line_buf *lsrc, const line_buf *hsrc, 
+    void gen_irv_horz_syn(const param_atk *atk, const line_buf* dst,
+                          const line_buf *lsrc, const line_buf *hsrc,
                           ui32 width, bool even);
 
     //////////////////////////////////////////////////////////////////////////
@@ -85,18 +85,18 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
 
     /////////////////////////////////////////////////////////////////////////
-    void gen_rev_vert_step(const lifting_step* s, const line_buf* sig, 
-                           const line_buf* other, const line_buf* aug, 
+    void gen_rev_vert_step(const lifting_step* s, const line_buf* sig,
+                           const line_buf* other, const line_buf* aug,
                            ui32 repeat, bool synthesis);
 
     /////////////////////////////////////////////////////////////////////////
-    void gen_rev_horz_ana(const param_atk* atk, const line_buf* ldst, 
-                          const line_buf* hdst, const line_buf* src, 
+    void gen_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
+                          const line_buf* hdst, const line_buf* src,
                           ui32 width, bool even);
 
     /////////////////////////////////////////////////////////////////////////
-    void gen_rev_horz_syn(const param_atk* atk, const line_buf* dst, 
-                          const line_buf* lsrc, const line_buf* hsrc, 
+    void gen_rev_horz_syn(const param_atk* atk, const line_buf* dst,
+                          const line_buf* lsrc, const line_buf* hsrc,
                           ui32 width, bool even);
 
     //////////////////////////////////////////////////////////////////////////
@@ -112,8 +112,8 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
 
     /////////////////////////////////////////////////////////////////////////
-    void sse_irv_vert_step(const lifting_step* s, const line_buf* sig, 
-                           const line_buf* other, const line_buf* aug, 
+    void sse_irv_vert_step(const lifting_step* s, const line_buf* sig,
+                           const line_buf* other, const line_buf* aug,
                            ui32 repeat, bool synthesis);
 
     /////////////////////////////////////////////////////////////////////////
@@ -121,12 +121,12 @@ namespace ojph {
 
     /////////////////////////////////////////////////////////////////////////
     void sse_irv_horz_ana(const param_atk* atk, const line_buf* ldst,
-                          const line_buf* hdst, const line_buf* src, 
+                          const line_buf* hdst, const line_buf* src,
                           ui32 width, bool even);
 
     /////////////////////////////////////////////////////////////////////////
     void sse_irv_horz_syn(const param_atk *atk, const line_buf* dst,
-                          const line_buf *lsrc, const line_buf *hsrc, 
+                          const line_buf *lsrc, const line_buf *hsrc,
                           ui32 width, bool even);
 
     //////////////////////////////////////////////////////////////////////////
@@ -142,18 +142,18 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
 
     /////////////////////////////////////////////////////////////////////////
-    void sse2_rev_vert_step(const lifting_step* s, const line_buf* sig, 
-                            const line_buf* other, const line_buf* aug, 
+    void sse2_rev_vert_step(const lifting_step* s, const line_buf* sig,
+                            const line_buf* other, const line_buf* aug,
                             ui32 repeat, bool synthesis);
 
     /////////////////////////////////////////////////////////////////////////
     void sse2_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
-                           const line_buf* hdst, const line_buf* src, 
+                           const line_buf* hdst, const line_buf* src,
                            ui32 width, bool even);
 
     /////////////////////////////////////////////////////////////////////////
     void sse2_rev_horz_syn(const param_atk* atk, const line_buf* dst,
-                           const line_buf* lsrc, const line_buf* hsrc, 
+                           const line_buf* lsrc, const line_buf* hsrc,
                            ui32 width, bool even);
 
 
@@ -170,8 +170,8 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
 
     /////////////////////////////////////////////////////////////////////////
-    void avx_irv_vert_step(const lifting_step* s, const line_buf* sig, 
-                           const line_buf* other, const line_buf* aug, 
+    void avx_irv_vert_step(const lifting_step* s, const line_buf* sig,
+                           const line_buf* other, const line_buf* aug,
                            ui32 repeat, bool synthesis);
 
     /////////////////////////////////////////////////////////////////////////
@@ -179,12 +179,12 @@ namespace ojph {
 
     /////////////////////////////////////////////////////////////////////////
     void avx_irv_horz_ana(const param_atk* atk, const line_buf* ldst,
-                          const line_buf* hdst, const line_buf* src, 
+                          const line_buf* hdst, const line_buf* src,
                           ui32 width, bool even);
 
     /////////////////////////////////////////////////////////////////////////
     void avx_irv_horz_syn(const param_atk *atk, const line_buf* dst,
-                          const line_buf *lsrc, const line_buf *hsrc, 
+                          const line_buf *lsrc, const line_buf *hsrc,
                           ui32 width, bool even);
 
     //////////////////////////////////////////////////////////////////////////
@@ -200,18 +200,18 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
 
     /////////////////////////////////////////////////////////////////////////
-    void avx2_rev_vert_step(const lifting_step* s, const line_buf* sig, 
-                            const line_buf* other, const line_buf* aug, 
+    void avx2_rev_vert_step(const lifting_step* s, const line_buf* sig,
+                            const line_buf* other, const line_buf* aug,
                             ui32 repeat, bool synthesis);
 
     /////////////////////////////////////////////////////////////////////////
     void avx2_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
-                           const line_buf* hdst, const line_buf* src, 
+                           const line_buf* hdst, const line_buf* src,
                            ui32 width, bool even);
 
     /////////////////////////////////////////////////////////////////////////
     void avx2_rev_horz_syn(const param_atk* atk, const line_buf* dst,
-                           const line_buf* lsrc, const line_buf* hsrc, 
+                           const line_buf* lsrc, const line_buf* hsrc,
                            ui32 width, bool even);
 
     //////////////////////////////////////////////////////////////////////////
@@ -227,8 +227,8 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
 
     /////////////////////////////////////////////////////////////////////////
-    void avx512_irv_vert_step(const lifting_step* s, const line_buf* sig, 
-                              const line_buf* other, const line_buf* aug, 
+    void avx512_irv_vert_step(const lifting_step* s, const line_buf* sig,
+                              const line_buf* other, const line_buf* aug,
                               ui32 repeat, bool synthesis);
 
     /////////////////////////////////////////////////////////////////////////
@@ -236,12 +236,12 @@ namespace ojph {
 
     /////////////////////////////////////////////////////////////////////////
     void avx512_irv_horz_ana(const param_atk* atk, const line_buf* ldst,
-                             const line_buf* hdst, const line_buf* src, 
+                             const line_buf* hdst, const line_buf* src,
                              ui32 width, bool even);
 
     /////////////////////////////////////////////////////////////////////////
     void avx512_irv_horz_syn(const param_atk *atk, const line_buf* dst,
-                             const line_buf *lsrc, const line_buf *hsrc, 
+                             const line_buf *lsrc, const line_buf *hsrc,
                              ui32 width, bool even);
 
 
@@ -251,17 +251,17 @@ namespace ojph {
 
     /////////////////////////////////////////////////////////////////////////
     void avx512_rev_vert_step(const lifting_step* s, const line_buf* sig,
-                              const line_buf* other, const line_buf* aug, 
+                              const line_buf* other, const line_buf* aug,
                               ui32 repeat, bool synthesis);
 
     /////////////////////////////////////////////////////////////////////////
     void avx512_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
-                             const line_buf* hdst, const line_buf* src, 
+                             const line_buf* hdst, const line_buf* src,
                              ui32 width, bool even);
 
     /////////////////////////////////////////////////////////////////////////
     void avx512_rev_horz_syn(const param_atk* atk, const line_buf* dst,
-                             const line_buf* lsrc, const line_buf* hsrc, 
+                             const line_buf* lsrc, const line_buf* hsrc,
                              ui32 width, bool even);
 
     //////////////////////////////////////////////////////////////////////////
@@ -277,8 +277,8 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
 
     /////////////////////////////////////////////////////////////////////////
-    void wasm_irv_vert_step(const lifting_step* s, const line_buf* sig, 
-                            const line_buf* other, const line_buf* aug, 
+    void wasm_irv_vert_step(const lifting_step* s, const line_buf* sig,
+                            const line_buf* other, const line_buf* aug,
                             ui32 repeat, bool synthesis);
 
     /////////////////////////////////////////////////////////////////////////
@@ -286,12 +286,12 @@ namespace ojph {
 
     /////////////////////////////////////////////////////////////////////////
     void wasm_irv_horz_ana(const param_atk* atk, const line_buf* ldst,
-                           const line_buf* hdst, const line_buf* src, 
+                           const line_buf* hdst, const line_buf* src,
                            ui32 width, bool even);
 
     /////////////////////////////////////////////////////////////////////////
     void wasm_irv_horz_syn(const param_atk *atk, const line_buf* dst,
-                           const line_buf *lsrc, const line_buf *hsrc, 
+                           const line_buf *lsrc, const line_buf *hsrc,
                            ui32 width, bool even);
 
     //////////////////////////////////////////////////////////////////////////
@@ -300,17 +300,65 @@ namespace ojph {
 
     /////////////////////////////////////////////////////////////////////////
     void wasm_rev_vert_step(const lifting_step* s, const line_buf* sig,
-                            const line_buf* other, const line_buf* aug, 
+                            const line_buf* other, const line_buf* aug,
                             ui32 repeat, bool synthesis);
 
     /////////////////////////////////////////////////////////////////////////
     void wasm_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
-                           const line_buf* hdst, const line_buf* src, 
+                           const line_buf* hdst, const line_buf* src,
                            ui32 width, bool even);
 
     /////////////////////////////////////////////////////////////////////////
     void wasm_rev_horz_syn(const param_atk* atk, const line_buf* dst,
-                           const line_buf* lsrc, const line_buf* hsrc, 
+                           const line_buf* lsrc, const line_buf* hsrc,
+                           ui32 width, bool even);
+    //////////////////////////////////////////////////////////////////////////
+    //
+    //
+    //                           VSX Functions
+    //
+    //
+    //////////////////////////////////////////////////////////////////////////
+
+    //////////////////////////////////////////////////////////////////////////
+    // Irreversible functions
+    //////////////////////////////////////////////////////////////////////////
+
+    /////////////////////////////////////////////////////////////////////////
+    void vsx_irv_vert_step(const lifting_step* s, const line_buf* sig,
+                            const line_buf* other, const line_buf* aug,
+                            ui32 repeat, bool synthesis);
+
+    /////////////////////////////////////////////////////////////////////////
+    void vsx_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat);
+
+    /////////////////////////////////////////////////////////////////////////
+    void vsx_irv_horz_ana(const param_atk* atk, const line_buf* ldst,
+                           const line_buf* hdst, const line_buf* src,
+                           ui32 width, bool even);
+
+    /////////////////////////////////////////////////////////////////////////
+    void vsx_irv_horz_syn(const param_atk *atk, const line_buf* dst,
+                           const line_buf *lsrc, const line_buf *hsrc,
+                           ui32 width, bool even);
+
+    //////////////////////////////////////////////////////////////////////////
+    // Reversible functions
+    //////////////////////////////////////////////////////////////////////////
+
+    /////////////////////////////////////////////////////////////////////////
+    void vsx_rev_vert_step(const lifting_step* s, const line_buf* sig,
+                            const line_buf* other, const line_buf* aug,
+                            ui32 repeat, bool synthesis);
+
+    /////////////////////////////////////////////////////////////////////////
+    void vsx_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
+                           const line_buf* hdst, const line_buf* src,
+                           ui32 width, bool even);
+
+    /////////////////////////////////////////////////////////////////////////
+    void vsx_rev_horz_syn(const param_atk* atk, const line_buf* dst,
+                           const line_buf* lsrc, const line_buf* hsrc,
                            ui32 width, bool even);
   }
 }
diff --git a/src/core/transform/ojph_transform_vsx.cpp b/src/core/transform/ojph_transform_vsx.cpp
new file mode 100644
index 00000000..ba8e4d9b
--- /dev/null
+++ b/src/core/transform/ojph_transform_vsx.cpp
@@ -0,0 +1,1310 @@
+//***************************************************************************/
+// This software is released under the 2-Clause BSD license, included
+// below.
+//
+// Copyright (c) 2021, Aous Naman 
+// Copyright (c) 2021, Kakadu Software Pty Ltd, Australia
+// Copyright (c) 2021, The University of New South Wales, Australia
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// 
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************/
+// This file is part of the OpenJPH software implementation.
+// File: ojph_transform_vsx.cpp
+// Author: Aous Naman
+// Date: 09 February 2021
+//***************************************************************************/
+
+#include <cstdio>
+#include "ojph_simd_vsx.h"
+
+#include "ojph_defs.h"
+#include "ojph_arch.h"
+#include "ojph_mem.h"
+#include "ojph_params.h"
+#include "../codestream/ojph_params_local.h"
+
+#include "ojph_transform.h"
+#include "ojph_transform_local.h"
+
+namespace ojph {
+  namespace local {
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void vsx_deinterleave32(float* dpl, float* dph, float* sp, int width)
+    {
+      for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)
+      {
+        v128_t a = vsx_v128_load(sp);
+        v128_t b = vsx_v128_load(sp + 4);
+        v128_t c = vsx_i32x4_shuffle(a, b, 0, 2, 4 + 0, 4 + 2);
+        v128_t d = vsx_i32x4_shuffle(a, b, 1, 3, 4 + 1, 4 + 3);
+        // v128_t c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
+        // v128_t d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
+        vsx_v128_store(dpl, c);
+        vsx_v128_store(dph, d);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void vsx_interleave32(float* dp, float* spl, float* sph, int width)
+    {
+      for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)
+      {
+        v128_t a = vsx_v128_load(spl);
+        v128_t b = vsx_v128_load(sph);
+        v128_t c = vsx_i32x4_shuffle(a, b, 0, 4 + 0, 1, 4 + 1);
+        v128_t d = vsx_i32x4_shuffle(a, b, 2, 4 + 2, 3, 4 + 3);
+        // v128_t c = _mm_unpacklo_ps(a, b);
+        // v128_t d = _mm_unpackhi_ps(a, b);
+        vsx_v128_store(dp, c);
+        vsx_v128_store(dp + 4, d);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline 
+    void vsx_deinterleave64(double* dpl, double* dph, double* sp, int width)
+    {
+      for (; width > 0; width -= 4, sp += 4, dpl += 2, dph += 2)
+      {
+        v128_t a = vsx_v128_load(sp);
+        v128_t b = vsx_v128_load(sp + 2);
+        v128_t c = vsx_i64x2_shuffle(a, b, 0, 2 + 0);
+        v128_t d = vsx_i64x2_shuffle(a, b, 1, 2 + 1);
+        vsx_v128_store(dpl, c);
+        vsx_v128_store(dph, d);
+      }
+    }    
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline 
+    void vsx_interleave64(double* dp, double* spl, double* sph, int width)
+    {
+      for (; width > 0; width -= 4, dp += 4, spl += 2, sph += 2)
+      {
+        v128_t a = vsx_v128_load(spl);
+        v128_t b = vsx_v128_load(sph);
+        v128_t c = vsx_i64x2_shuffle(a, b, 0, 2 + 0);
+        v128_t d = vsx_i64x2_shuffle(a, b, 1, 2 + 1);
+        vsx_v128_store(dp, c);
+        vsx_v128_store(dp + 2, d);
+      }
+    }    
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline void vsx_multiply_const(float* p, float f, int width)
+    {
+      v128_t factor = vsx_f32x4_splat(f);
+      for (; width > 0; width -= 4, p += 4)
+      {
+        v128_t s = vsx_v128_load(p);
+        vsx_v128_store(p, vsx_f32x4_mul(factor, s));
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void vsx_irv_vert_step(const lifting_step* s, const line_buf* sig, 
+                            const line_buf* other, const line_buf* aug, 
+                            ui32 repeat, bool synthesis)
+    {
+      float a = s->irv.Aatk;
+      if (synthesis)
+        a = -a;
+
+      v128_t factor = vsx_f32x4_splat(a);
+
+      float* dst = aug->f32;
+      const float* src1 = sig->f32, * src2 = other->f32;
+      int i = (int)repeat;
+      for ( ; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+      {
+        v128_t s1 = vsx_v128_load(src1);
+        v128_t s2 = vsx_v128_load(src2);
+        v128_t d  = vsx_v128_load(dst);
+        d = vsx_f32x4_add(d, vsx_f32x4_mul(factor, vsx_f32x4_add(s1, s2)));
+        vsx_v128_store(dst, d);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void vsx_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat)
+    {
+      vsx_multiply_const(aug->f32, K, (int)repeat);
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    void vsx_irv_horz_ana(const param_atk* atk, const line_buf* ldst, 
+                           const line_buf* hdst, const line_buf* src, 
+                           ui32 width, bool even)
+    {
+      if (width > 1)
+      {
+        // split src into ldst and hdst
+        {
+          float* dpl = even ? ldst->f32 : hdst->f32;
+          float* dph = even ? hdst->f32 : ldst->f32;
+          float* sp = src->f32;
+          int w = (int)width;
+          vsx_deinterleave32(dpl, dph, sp, w);
+        }        
+
+        // the actual horizontal transform
+        float* hp = hdst->f32, * lp = ldst->f32;
+        ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = num_steps; j > 0; --j)
+        {
+          const lifting_step* s = atk->get_step(j - 1);
+          const float a = s->irv.Aatk;
+
+          // extension
+          lp[-1] = lp[0];
+          lp[l_width] = lp[l_width - 1];
+          // lifting step
+          const float* sp = lp;
+          float* dp = hp;
+          int i = (int)h_width;
+          v128_t f = vsx_f32x4_splat(a);
+          if (even)
+          {
+            for (; i > 0; i -= 4, sp += 4, dp += 4)
+            {
+              v128_t m = vsx_v128_load(sp);
+              v128_t n = vsx_v128_load(sp + 1);
+              v128_t p = vsx_v128_load(dp);
+              p = vsx_f32x4_add(p, vsx_f32x4_mul(f, vsx_f32x4_add(m, n)));
+              vsx_v128_store(dp, p);
+            }
+          }
+          else
+          {
+            for (; i > 0; i -= 4, sp += 4, dp += 4)
+            {
+              v128_t m = vsx_v128_load(sp);
+              v128_t n = vsx_v128_load(sp - 1);
+              v128_t p = vsx_v128_load(dp);
+              p = vsx_f32x4_add(p, vsx_f32x4_mul(f, vsx_f32x4_add(m, n)));
+              vsx_v128_store(dp, p);
+            }
+          }
+
+          // swap buffers
+          float* t = lp; lp = hp; hp = t;
+          even = !even;
+          ui32 w = l_width; l_width = h_width; h_width = w;
+        }
+
+        { // multiply by K or 1/K
+          float K = atk->get_K();
+          float K_inv = 1.0f / K;
+          vsx_multiply_const(lp, K_inv, (int)l_width);
+          vsx_multiply_const(hp, K, (int)h_width);
+        }
+      }
+      else {
+        if (even)
+          ldst->f32[0] = src->f32[0];
+        else
+          hdst->f32[0] = src->f32[0] * 2.0f;
+      }
+    }
+    
+    //////////////////////////////////////////////////////////////////////////
+    void vsx_irv_horz_syn(const param_atk* atk, const line_buf* dst, 
+                           const line_buf* lsrc, const line_buf* hsrc, 
+                           ui32 width, bool even)
+    {
+      if (width > 1)
+      {
+        bool ev = even;
+        float* oth = hsrc->f32, * aug = lsrc->f32;
+        ui32 aug_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 oth_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+
+        { // multiply by K or 1/K
+          float K = atk->get_K();
+          float K_inv = 1.0f / K;
+          vsx_multiply_const(aug, K, (int)aug_width);
+          vsx_multiply_const(oth, K_inv, (int)oth_width);
+        }
+
+        // the actual horizontal transform
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = 0; j < num_steps; ++j)
+        {
+          const lifting_step* s = atk->get_step(j);
+          const float a = s->irv.Aatk;
+
+          // extension
+          oth[-1] = oth[0];
+          oth[oth_width] = oth[oth_width - 1];
+          // lifting step
+          const float* sp = oth;
+          float* dp = aug;
+          int i = (int)aug_width;
+          v128_t f = vsx_f32x4_splat(a);
+          if (ev)
+          {
+            for ( ; i > 0; i -= 4, sp += 4, dp += 4)
+            {
+              v128_t m = vsx_v128_load(sp);
+              v128_t n = vsx_v128_load(sp - 1);
+              v128_t p = vsx_v128_load(dp);
+              p = vsx_f32x4_sub(p, vsx_f32x4_mul(f, vsx_f32x4_add(m, n)));
+              vsx_v128_store(dp, p);
+            }
+          }
+          else
+          {
+            for ( ; i > 0; i -= 4, sp += 4, dp += 4)
+            {
+              v128_t m = vsx_v128_load(sp);
+              v128_t n = vsx_v128_load(sp + 1);
+              v128_t p = vsx_v128_load(dp);
+              p = vsx_f32x4_sub(p, vsx_f32x4_mul(f, vsx_f32x4_add(m, n)));
+              vsx_v128_store(dp, p);
+            }
+          }
+
+          // swap buffers
+          float* t = aug; aug = oth; oth = t;
+          ev = !ev;
+          ui32 w = aug_width; aug_width = oth_width; oth_width = w;
+        }
+
+        // combine both lsrc and hsrc into dst
+        {
+          float* dp = dst->f32;
+          float* spl = even ? lsrc->f32 : hsrc->f32;
+          float* sph = even ? hsrc->f32 : lsrc->f32;
+          int w = (int)width;
+          vsx_interleave32(dp, spl, sph, w);
+        }        
+      }
+      else {
+        if (even)
+          dst->f32[0] = lsrc->f32[0];
+        else
+          dst->f32[0] = hsrc->f32[0] * 0.5f;
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    void vsx_rev_vert_step32(const lifting_step* s, const line_buf* sig, 
+                              const line_buf* other, const line_buf* aug, 
+                              ui32 repeat, bool synthesis)
+    {
+      const si32 a = s->rev.Aatk;
+      const si32 b = s->rev.Batk;
+      const ui8 e = s->rev.Eatk;
+      v128_t va = vsx_i32x4_splat(a);
+      v128_t vb = vsx_i32x4_splat(b);
+
+      si32* dst = aug->i32;
+      const si32* src1 = sig->i32, * src2 = other->i32;
+      // The general definition of the wavelet in Part 2 is slightly 
+      // different to part 2, although they are mathematically equivalent
+      // here, we identify the simpler form from Part 1 and employ them
+      if (a == 1)
+      { // 5/3 update and any case with a == 1
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            v128_t s1 = vsx_v128_load((v128_t*)src1);
+            v128_t s2 = vsx_v128_load((v128_t*)src2);
+            v128_t d = vsx_v128_load((v128_t*)dst);
+            v128_t t = vsx_i32x4_add(s1, s2);
+            v128_t v = vsx_i32x4_add(vb, t);
+            v128_t w = vsx_i32x4_shr(v, e);
+            d = vsx_i32x4_sub(d, w);
+            vsx_v128_store((v128_t*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            v128_t s1 = vsx_v128_load((v128_t*)src1);
+            v128_t s2 = vsx_v128_load((v128_t*)src2);
+            v128_t d = vsx_v128_load((v128_t*)dst);
+            v128_t t = vsx_i32x4_add(s1, s2);
+            v128_t v = vsx_i32x4_add(vb, t);
+            v128_t w = vsx_i32x4_shr(v, e);
+            d = vsx_i32x4_add(d, w);
+            vsx_v128_store((v128_t*)dst, d);
+          }
+      }
+      else if (a == -1 && b == 1 && e == 1)
+      { // 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            v128_t s1 = vsx_v128_load((v128_t*)src1);
+            v128_t s2 = vsx_v128_load((v128_t*)src2);
+            v128_t d = vsx_v128_load((v128_t*)dst);
+            v128_t t = vsx_i32x4_add(s1, s2);
+            v128_t w = vsx_i32x4_shr(t, e);
+            d = vsx_i32x4_add(d, w);
+            vsx_v128_store((v128_t*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            v128_t s1 = vsx_v128_load((v128_t*)src1);
+            v128_t s2 = vsx_v128_load((v128_t*)src2);
+            v128_t d = vsx_v128_load((v128_t*)dst);
+            v128_t t = vsx_i32x4_add(s1, s2);
+            v128_t w = vsx_i32x4_shr(t, e);
+            d = vsx_i32x4_sub(d, w);
+            vsx_v128_store((v128_t*)dst, d);
+          }
+      }
+      else if (a == -1)
+      { // any case with a == -1, which is not 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            v128_t s1 = vsx_v128_load((v128_t*)src1);
+            v128_t s2 = vsx_v128_load((v128_t*)src2);
+            v128_t d = vsx_v128_load((v128_t*)dst);
+            v128_t t = vsx_i32x4_add(s1, s2);
+            v128_t v = vsx_i32x4_sub(vb, t);
+            v128_t w = vsx_i32x4_shr(v, e);
+            d = vsx_i32x4_sub(d, w);
+            vsx_v128_store((v128_t*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            v128_t s1 = vsx_v128_load((v128_t*)src1);
+            v128_t s2 = vsx_v128_load((v128_t*)src2);
+            v128_t d = vsx_v128_load((v128_t*)dst);
+            v128_t t = vsx_i32x4_add(s1, s2);
+            v128_t v = vsx_i32x4_sub(vb, t);
+            v128_t w = vsx_i32x4_shr(v, e);
+            d = vsx_i32x4_add(d, w);
+            vsx_v128_store((v128_t*)dst, d);
+          }
+      }
+      else 
+      { // general case
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            v128_t s1 = vsx_v128_load((v128_t*)src1);
+            v128_t s2 = vsx_v128_load((v128_t*)src2);
+            v128_t d = vsx_v128_load((v128_t*)dst);
+            v128_t t = vsx_i32x4_add(s1, s2);
+            v128_t u = vsx_i32x4_mul(va, t);
+            v128_t v = vsx_i32x4_add(vb, u);
+            v128_t w = vsx_i32x4_shr(v, e);
+            d = vsx_i32x4_sub(d, w);
+            vsx_v128_store((v128_t*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            v128_t s1 = vsx_v128_load((v128_t*)src1);
+            v128_t s2 = vsx_v128_load((v128_t*)src2);
+            v128_t d = vsx_v128_load((v128_t*)dst);
+            v128_t t = vsx_i32x4_add(s1, s2);
+            v128_t u = vsx_i32x4_mul(va, t);
+            v128_t v = vsx_i32x4_add(vb, u);
+            v128_t w = vsx_i32x4_shr(v, e);
+            d = vsx_i32x4_add(d, w);
+            vsx_v128_store((v128_t*)dst, d);
+          }
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    void vsx_rev_vert_step64(const lifting_step* s, const line_buf* sig, 
+                              const line_buf* other, const line_buf* aug, 
+                              ui32 repeat, bool synthesis)
+    {
+      const si32 a = s->rev.Aatk;
+      const si32 b = s->rev.Batk;
+      const ui8 e = s->rev.Eatk;
+      v128_t va = vsx_i64x2_splat(a);
+      v128_t vb = vsx_i64x2_splat(b);
+
+      si64* dst = aug->i64;
+      const si64* src1 = sig->i64, * src2 = other->i64;
+      // The general definition of the wavelet in Part 2 is slightly 
+      // different to part 2, although they are mathematically equivalent
+      // here, we identify the simpler form from Part 1 and employ them
+      if (a == 1)
+      { // 5/3 update and any case with a == 1
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            v128_t s1 = vsx_v128_load((v128_t*)src1);
+            v128_t s2 = vsx_v128_load((v128_t*)src2);
+            v128_t d = vsx_v128_load((v128_t*)dst);
+            v128_t t = vsx_i64x2_add(s1, s2);
+            v128_t v = vsx_i64x2_add(vb, t);
+            v128_t w = vsx_i64x2_shr(v, e);
+            d = vsx_i64x2_sub(d, w);
+            vsx_v128_store((v128_t*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            v128_t s1 = vsx_v128_load((v128_t*)src1);
+            v128_t s2 = vsx_v128_load((v128_t*)src2);
+            v128_t d = vsx_v128_load((v128_t*)dst);
+            v128_t t = vsx_i64x2_add(s1, s2);
+            v128_t v = vsx_i64x2_add(vb, t);
+            v128_t w = vsx_i64x2_shr(v, e);
+            d = vsx_i64x2_add(d, w);
+            vsx_v128_store((v128_t*)dst, d);
+          }
+      }
+      else if (a == -1 && b == 1 && e == 1)
+      { // 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            v128_t s1 = vsx_v128_load((v128_t*)src1);
+            v128_t s2 = vsx_v128_load((v128_t*)src2);
+            v128_t d = vsx_v128_load((v128_t*)dst);
+            v128_t t = vsx_i64x2_add(s1, s2);
+            v128_t w = vsx_i64x2_shr(t, e);
+            d = vsx_i64x2_add(d, w);
+            vsx_v128_store((v128_t*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            v128_t s1 = vsx_v128_load((v128_t*)src1);
+            v128_t s2 = vsx_v128_load((v128_t*)src2);
+            v128_t d = vsx_v128_load((v128_t*)dst);
+            v128_t t = vsx_i64x2_add(s1, s2);
+            v128_t w = vsx_i64x2_shr(t, e);
+            d = vsx_i64x2_sub(d, w);
+            vsx_v128_store((v128_t*)dst, d);
+          }
+      }
+      else if (a == -1)
+      { // any case with a == -1, which is not 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            v128_t s1 = vsx_v128_load((v128_t*)src1);
+            v128_t s2 = vsx_v128_load((v128_t*)src2);
+            v128_t d = vsx_v128_load((v128_t*)dst);
+            v128_t t = vsx_i64x2_add(s1, s2);
+            v128_t v = vsx_i64x2_sub(vb, t);
+            v128_t w = vsx_i64x2_shr(v, e);
+            d = vsx_i64x2_sub(d, w);
+            vsx_v128_store((v128_t*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            v128_t s1 = vsx_v128_load((v128_t*)src1);
+            v128_t s2 = vsx_v128_load((v128_t*)src2);
+            v128_t d = vsx_v128_load((v128_t*)dst);
+            v128_t t = vsx_i64x2_add(s1, s2);
+            v128_t v = vsx_i64x2_sub(vb, t);
+            v128_t w = vsx_i64x2_shr(v, e);
+            d = vsx_i64x2_add(d, w);
+            vsx_v128_store((v128_t*)dst, d);
+          }
+      }
+      else 
+      { // general case
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            v128_t s1 = vsx_v128_load((v128_t*)src1);
+            v128_t s2 = vsx_v128_load((v128_t*)src2);
+            v128_t d = vsx_v128_load((v128_t*)dst);
+            v128_t t = vsx_i64x2_add(s1, s2);
+            v128_t u = vsx_i64x2_mul(va, t);
+            v128_t v = vsx_i64x2_add(vb, u);
+            v128_t w = vsx_i64x2_shr(v, e);
+            d = vsx_i64x2_sub(d, w);
+            vsx_v128_store((v128_t*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            v128_t s1 = vsx_v128_load((v128_t*)src1);
+            v128_t s2 = vsx_v128_load((v128_t*)src2);
+            v128_t d = vsx_v128_load((v128_t*)dst);
+            v128_t t = vsx_i64x2_add(s1, s2);
+            v128_t u = vsx_i64x2_mul(va, t);
+            v128_t v = vsx_i64x2_add(vb, u);
+            v128_t w = vsx_i64x2_shr(v, e);
+            d = vsx_i64x2_add(d, w);
+            vsx_v128_store((v128_t*)dst, d);
+          }
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    void vsx_rev_vert_step(const lifting_step* s, const line_buf* sig, 
+                            const line_buf* other, const line_buf* aug, 
+                            ui32 repeat, bool synthesis)
+    {
+      if (((sig != NULL) && (sig->flags & line_buf::LFT_32BIT)) || 
+          ((aug != NULL) && (aug->flags & line_buf::LFT_32BIT)) ||
+          ((other != NULL) && (other->flags & line_buf::LFT_32BIT))) 
+      {
+        assert((sig == NULL || sig->flags & line_buf::LFT_32BIT) &&
+               (other == NULL || other->flags & line_buf::LFT_32BIT) && 
+               (aug == NULL || aug->flags & line_buf::LFT_32BIT));
+        vsx_rev_vert_step32(s, sig, other, aug, repeat, synthesis);
+      }
+      else 
+      {
+        assert((sig == NULL || sig->flags & line_buf::LFT_64BIT) &&
+               (other == NULL || other->flags & line_buf::LFT_64BIT) && 
+               (aug == NULL || aug->flags & line_buf::LFT_64BIT));
+        vsx_rev_vert_step64(s, sig, other, aug, repeat, synthesis);
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    static
+    void vsx_rev_horz_ana32(const param_atk* atk, const line_buf* ldst, 
+                             const line_buf* hdst, const line_buf* src, 
+                             ui32 width, bool even)
+    {
+      if (width > 1)
+      {
+        // combine both lsrc and hsrc into dst
+        {
+          float* dpl = even ? ldst->f32 : hdst->f32;
+          float* dph = even ? hdst->f32 : ldst->f32;
+          float* sp = src->f32;
+          int w = (int)width;
+          vsx_deinterleave32(dpl, dph, sp, w);
+        }        
+
+        si32* hp = hdst->i32, * lp = ldst->i32;
+        ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = num_steps; j > 0; --j)
+        {
+          // first lifting step
+          const lifting_step* s = atk->get_step(j - 1);
+          const si32 a = s->rev.Aatk;
+          const si32 b = s->rev.Batk;
+          const ui8 e = s->rev.Eatk;
+          v128_t va = vsx_i32x4_splat(a);
+          v128_t vb = vsx_i32x4_splat(b);
+
+          // extension
+          lp[-1] = lp[0];
+          lp[l_width] = lp[l_width - 1];
+          // lifting step
+          const si32* sp = lp;
+          si32* dp = hp;
+          if (a == 1)
+          { // 5/3 update and any case with a == 1
+            int i = (int)h_width;
+            if (even)
+            {
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = vsx_v128_load((v128_t*)sp);
+                v128_t s2 = vsx_v128_load((v128_t*)(sp + 1));
+                v128_t d = vsx_v128_load((v128_t*)dp);
+                v128_t t = vsx_i32x4_add(s1, s2);
+                v128_t v = vsx_i32x4_add(vb, t);
+                v128_t w = vsx_i32x4_shr(v, e);
+                d = vsx_i32x4_add(d, w);
+                vsx_v128_store((v128_t*)dp, d);
+              }
+            }
+            else
+            {
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = vsx_v128_load((v128_t*)sp);
+                v128_t s2 = vsx_v128_load((v128_t*)(sp - 1));
+                v128_t d = vsx_v128_load((v128_t*)dp);
+                v128_t t = vsx_i32x4_add(s1, s2);
+                v128_t v = vsx_i32x4_add(vb, t);
+                v128_t w = vsx_i32x4_shr(v, e);
+                d = vsx_i32x4_add(d, w);
+                vsx_v128_store((v128_t*)dp, d);
+              }
+            }
+          }
+          else if (a == -1 && b == 1 && e == 1)
+          {  // 5/3 predict
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = vsx_v128_load((v128_t*)sp);
+                v128_t s2 = vsx_v128_load((v128_t*)(sp + 1));
+                v128_t d = vsx_v128_load((v128_t*)dp);
+                v128_t t = vsx_i32x4_add(s1, s2);
+                v128_t w = vsx_i32x4_shr(t, e);
+                d = vsx_i32x4_sub(d, w);
+                vsx_v128_store((v128_t*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = vsx_v128_load((v128_t*)sp);
+                v128_t s2 = vsx_v128_load((v128_t*)(sp - 1));
+                v128_t d = vsx_v128_load((v128_t*)dp);
+                v128_t t = vsx_i32x4_add(s1, s2);
+                v128_t w = vsx_i32x4_shr(t, e);
+                d = vsx_i32x4_sub(d, w);
+                vsx_v128_store((v128_t*)dp, d);
+              }
+          }
+          else if (a == -1)
+          { // any case with a == -1, which is not 5/3 predict
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = vsx_v128_load((v128_t*)sp);
+                v128_t s2 = vsx_v128_load((v128_t*)(sp + 1));
+                v128_t d = vsx_v128_load((v128_t*)dp);
+                v128_t t = vsx_i32x4_add(s1, s2);
+                v128_t v = vsx_i32x4_sub(vb, t);
+                v128_t w = vsx_i32x4_shr(v, e);
+                d = vsx_i32x4_add(d, w);
+                vsx_v128_store((v128_t*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = vsx_v128_load((v128_t*)sp);
+                v128_t s2 = vsx_v128_load((v128_t*)(sp - 1));
+                v128_t d = vsx_v128_load((v128_t*)dp);
+                v128_t t = vsx_i32x4_add(s1, s2);
+                v128_t v = vsx_i32x4_sub(vb, t);
+                v128_t w = vsx_i32x4_shr(v, e);
+                d = vsx_i32x4_add(d, w);
+                vsx_v128_store((v128_t*)dp, d);
+              }
+          }
+          else 
+          { // general case
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = vsx_v128_load((v128_t*)sp);
+                v128_t s2 = vsx_v128_load((v128_t*)(sp + 1));
+                v128_t d = vsx_v128_load((v128_t*)dp);
+                v128_t t = vsx_i32x4_add(s1, s2);
+                v128_t u = vsx_i32x4_mul(va, t);
+                v128_t v = vsx_i32x4_add(vb, u);
+                v128_t w = vsx_i32x4_shr(v, e);
+                d = vsx_i32x4_add(d, w);
+                vsx_v128_store((v128_t*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = vsx_v128_load((v128_t*)sp);
+                v128_t s2 = vsx_v128_load((v128_t*)(sp - 1));
+                v128_t d = vsx_v128_load((v128_t*)dp);
+                v128_t t = vsx_i32x4_add(s1, s2);
+                v128_t u = vsx_i32x4_mul(va, t);                
+                v128_t v = vsx_i32x4_add(vb, u);
+                v128_t w = vsx_i32x4_shr(v, e);
+                d = vsx_i32x4_add(d, w);
+                vsx_v128_store((v128_t*)dp, d);
+              }
+          }
+
+          // swap buffers
+          si32* t = lp; lp = hp; hp = t;
+          even = !even;
+          ui32 w = l_width; l_width = h_width; h_width = w;
+        }
+      }
+      else {
+        if (even)
+          ldst->i32[0] = src->i32[0];
+        else
+          hdst->i32[0] = src->i32[0] << 1;
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    static
+    void vsx_rev_horz_ana64(const param_atk* atk, const line_buf* ldst, 
+                             const line_buf* hdst, const line_buf* src, 
+                             ui32 width, bool even)
+    {
+      if (width > 1)
+      {
+        // combine both lsrc and hsrc into dst
+        {
+          double* dpl = (double*)(even ? ldst->p : hdst->p);
+          double* dph = (double*)(even ? hdst->p : ldst->p);
+          double* sp  = (double*)src->p;
+          int w = (int)width;
+          vsx_deinterleave64(dpl, dph, sp, w);
+        }        
+
+        si64* hp = hdst->i64, * lp = ldst->i64;
+        ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = num_steps; j > 0; --j)
+        {
+          // first lifting step
+          const lifting_step* s = atk->get_step(j - 1);
+          const si32 a = s->rev.Aatk;
+          const si32 b = s->rev.Batk;
+          const ui8 e = s->rev.Eatk;
+          v128_t va = vsx_i64x2_splat(a);
+          v128_t vb = vsx_i64x2_splat(b);
+
+          // extension
+          lp[-1] = lp[0];
+          lp[l_width] = lp[l_width - 1];
+          // lifting step
+          const si64* sp = lp;
+          si64* dp = hp;
+          if (a == 1)
+          { // 5/3 update and any case with a == 1
+            int i = (int)h_width;
+            if (even)
+            {
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = vsx_v128_load((v128_t*)sp);
+                v128_t s2 = vsx_v128_load((v128_t*)(sp + 1));
+                v128_t d = vsx_v128_load((v128_t*)dp);
+                v128_t t = vsx_i64x2_add(s1, s2);
+                v128_t v = vsx_i64x2_add(vb, t);
+                v128_t w = vsx_i64x2_shr(v, e);
+                d = vsx_i64x2_add(d, w);
+                vsx_v128_store((v128_t*)dp, d);
+              }
+            }
+            else
+            {
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = vsx_v128_load((v128_t*)sp);
+                v128_t s2 = vsx_v128_load((v128_t*)(sp - 1));
+                v128_t d = vsx_v128_load((v128_t*)dp);
+                v128_t t = vsx_i64x2_add(s1, s2);
+                v128_t v = vsx_i64x2_add(vb, t);
+                v128_t w = vsx_i64x2_shr(v, e);
+                d = vsx_i64x2_add(d, w);
+                vsx_v128_store((v128_t*)dp, d);
+              }
+            }
+          }
+          else if (a == -1 && b == 1 && e == 1)
+          {  // 5/3 predict
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = vsx_v128_load((v128_t*)sp);
+                v128_t s2 = vsx_v128_load((v128_t*)(sp + 1));
+                v128_t d = vsx_v128_load((v128_t*)dp);
+                v128_t t = vsx_i64x2_add(s1, s2);
+                v128_t w = vsx_i64x2_shr(t, e);
+                d = vsx_i64x2_sub(d, w);
+                vsx_v128_store((v128_t*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = vsx_v128_load((v128_t*)sp);
+                v128_t s2 = vsx_v128_load((v128_t*)(sp - 1));
+                v128_t d = vsx_v128_load((v128_t*)dp);
+                v128_t t = vsx_i64x2_add(s1, s2);
+                v128_t w = vsx_i64x2_shr(t, e);
+                d = vsx_i64x2_sub(d, w);
+                vsx_v128_store((v128_t*)dp, d);
+              }
+          }
+          else if (a == -1)
+          { // any case with a == -1, which is not 5/3 predict
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = vsx_v128_load((v128_t*)sp);
+                v128_t s2 = vsx_v128_load((v128_t*)(sp + 1));
+                v128_t d = vsx_v128_load((v128_t*)dp);
+                v128_t t = vsx_i64x2_add(s1, s2);
+                v128_t v = vsx_i64x2_sub(vb, t);
+                v128_t w = vsx_i64x2_shr(v, e);
+                d = vsx_i64x2_add(d, w);
+                vsx_v128_store((v128_t*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = vsx_v128_load((v128_t*)sp);
+                v128_t s2 = vsx_v128_load((v128_t*)(sp - 1));
+                v128_t d = vsx_v128_load((v128_t*)dp);
+                v128_t t = vsx_i64x2_add(s1, s2);
+                v128_t v = vsx_i64x2_sub(vb, t);
+                v128_t w = vsx_i64x2_shr(v, e);
+                d = vsx_i64x2_add(d, w);
+                vsx_v128_store((v128_t*)dp, d);
+              }
+          }
+          else 
+          { // general case
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = vsx_v128_load((v128_t*)sp);
+                v128_t s2 = vsx_v128_load((v128_t*)(sp + 1));
+                v128_t d = vsx_v128_load((v128_t*)dp);
+                v128_t t = vsx_i64x2_add(s1, s2);
+                v128_t u = vsx_i64x2_mul(va, t);
+                v128_t v = vsx_i64x2_add(vb, u);
+                v128_t w = vsx_i64x2_shr(v, e);
+                d = vsx_i64x2_add(d, w);
+                vsx_v128_store((v128_t*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = vsx_v128_load((v128_t*)sp);
+                v128_t s2 = vsx_v128_load((v128_t*)(sp - 1));
+                v128_t d = vsx_v128_load((v128_t*)dp);
+                v128_t t = vsx_i64x2_add(s1, s2);
+                v128_t u = vsx_i64x2_mul(va, t);                
+                v128_t v = vsx_i64x2_add(vb, u);
+                v128_t w = vsx_i64x2_shr(v, e);
+                d = vsx_i64x2_add(d, w);
+                vsx_v128_store((v128_t*)dp, d);
+              }
+          }
+
+          // swap buffers
+          si64* t = lp; lp = hp; hp = t;
+          even = !even;
+          ui32 w = l_width; l_width = h_width; h_width = w;
+        }
+      }
+      else {
+        if (even)
+          ldst->i64[0] = src->i64[0];
+        else
+          hdst->i64[0] = src->i64[0] << 1;
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    void vsx_rev_horz_ana(const param_atk* atk, const line_buf* ldst, 
+                           const line_buf* hdst, const line_buf* src, 
+                           ui32 width, bool even)
+    {
+      if (src->flags & line_buf::LFT_32BIT) 
+      {
+        assert((ldst == NULL || ldst->flags & line_buf::LFT_32BIT) &&
+               (hdst == NULL || hdst->flags & line_buf::LFT_32BIT));
+        vsx_rev_horz_ana32(atk, ldst, hdst, src, width, even);
+      }
+      else 
+      {
+        assert((ldst == NULL || ldst->flags & line_buf::LFT_64BIT) &&
+               (hdst == NULL || hdst->flags & line_buf::LFT_64BIT) && 
+               (src == NULL || src->flags & line_buf::LFT_64BIT));
+        vsx_rev_horz_ana64(atk, ldst, hdst, src, width, even);
+      }
+    } 
+
+    //////////////////////////////////////////////////////////////////////////
+    void vsx_rev_horz_syn32(const param_atk* atk, const line_buf* dst, 
+                             const line_buf* lsrc, const line_buf* hsrc, 
+                             ui32 width, bool even)
+    {
+      if (width > 1)
+      {
+        bool ev = even;
+        si32* oth = hsrc->i32, * aug = lsrc->i32;
+        ui32 aug_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 oth_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = 0; j < num_steps; ++j)
+        {
+          const lifting_step* s = atk->get_step(j);
+          const si32 a = s->rev.Aatk;
+          const si32 b = s->rev.Batk;
+          const ui8 e = s->rev.Eatk;
+          v128_t va = vsx_i32x4_splat(a);
+          v128_t vb = vsx_i32x4_splat(b);
+
+          // extension
+          oth[-1] = oth[0];
+          oth[oth_width] = oth[oth_width - 1];
+          // lifting step
+          const si32* sp = oth;
+          si32* dp = aug;
+          if (a == 1)
+          { // 5/3 update and any case with a == 1
+            int i = (int)aug_width;
+            if (ev)
+            {
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = vsx_v128_load((v128_t*)sp);
+                v128_t s2 = vsx_v128_load((v128_t*)(sp - 1));
+                v128_t d = vsx_v128_load((v128_t*)dp);
+                v128_t t = vsx_i32x4_add(s1, s2);
+                v128_t v = vsx_i32x4_add(vb, t);
+                v128_t w = vsx_i32x4_shr(v, e);
+                d = vsx_i32x4_sub(d, w);
+                vsx_v128_store((v128_t*)dp, d);
+              }
+            }
+            else
+            {
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = vsx_v128_load((v128_t*)sp);
+                v128_t s2 = vsx_v128_load((v128_t*)(sp + 1));
+                v128_t d = vsx_v128_load((v128_t*)dp);
+                v128_t t = vsx_i32x4_add(s1, s2);
+                v128_t v = vsx_i32x4_add(vb, t);
+                v128_t w = vsx_i32x4_shr(v, e);
+                d = vsx_i32x4_sub(d, w);
+                vsx_v128_store((v128_t*)dp, d);
+              }
+            }
+          }
+          else if (a == -1 && b == 1 && e == 1)
+          {  // 5/3 predict
+            int i = (int)aug_width;
+            if (ev)
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = vsx_v128_load((v128_t*)sp);
+                v128_t s2 = vsx_v128_load((v128_t*)(sp - 1));
+                v128_t d = vsx_v128_load((v128_t*)dp);
+                v128_t t = vsx_i32x4_add(s1, s2);
+                v128_t w = vsx_i32x4_shr(t, e);
+                d = vsx_i32x4_add(d, w);
+                vsx_v128_store((v128_t*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = vsx_v128_load((v128_t*)sp);
+                v128_t s2 = vsx_v128_load((v128_t*)(sp + 1));
+                v128_t d = vsx_v128_load((v128_t*)dp);
+                v128_t t = vsx_i32x4_add(s1, s2);
+                v128_t w = vsx_i32x4_shr(t, e);
+                d = vsx_i32x4_add(d, w);
+                vsx_v128_store((v128_t*)dp, d);
+              }
+          }
+          else if (a == -1)
+          { // any case with a == -1, which is not 5/3 predict
+            int i = (int)aug_width;
+            if (ev)
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = vsx_v128_load((v128_t*)sp);
+                v128_t s2 = vsx_v128_load((v128_t*)(sp - 1));
+                v128_t d = vsx_v128_load((v128_t*)dp);
+                v128_t t = vsx_i32x4_add(s1, s2);
+                v128_t v = vsx_i32x4_sub(vb, t);
+                v128_t w = vsx_i32x4_shr(v, e);
+                d = vsx_i32x4_sub(d, w);
+                vsx_v128_store((v128_t*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = vsx_v128_load((v128_t*)sp);
+                v128_t s2 = vsx_v128_load((v128_t*)(sp + 1));
+                v128_t d = vsx_v128_load((v128_t*)dp);
+                v128_t t = vsx_i32x4_add(s1, s2);
+                v128_t v = vsx_i32x4_sub(vb, t);
+                v128_t w = vsx_i32x4_shr(v, e);
+                d = vsx_i32x4_sub(d, w);
+                vsx_v128_store((v128_t*)dp, d);
+              }
+          }
+          else 
+          { // general case
+            int i = (int)aug_width;
+            if (ev)
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = vsx_v128_load((v128_t*)sp);
+                v128_t s2 = vsx_v128_load((v128_t*)(sp - 1));
+                v128_t d = vsx_v128_load((v128_t*)dp);
+                v128_t t = vsx_i32x4_add(s1, s2);
+                v128_t u = vsx_i32x4_mul(va, t);
+                v128_t v = vsx_i32x4_add(vb, u);
+                v128_t w = vsx_i32x4_shr(v, e);
+                d = vsx_i32x4_sub(d, w);
+                vsx_v128_store((v128_t*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = vsx_v128_load((v128_t*)sp);
+                v128_t s2 = vsx_v128_load((v128_t*)(sp + 1));
+                v128_t d = vsx_v128_load((v128_t*)dp);
+                v128_t t = vsx_i32x4_add(s1, s2);
+                v128_t u = vsx_i32x4_mul(va, t);                
+                v128_t v = vsx_i32x4_add(vb, u);
+                v128_t w = vsx_i32x4_shr(v, e);
+                d = vsx_i32x4_sub(d, w);
+                vsx_v128_store((v128_t*)dp, d);
+              }
+          }
+
+          // swap buffers
+          si32* t = aug; aug = oth; oth = t;
+          ev = !ev;
+          ui32 w = aug_width; aug_width = oth_width; oth_width = w;
+        }
+
+        // combine both lsrc and hsrc into dst
+        {
+          float* dp = dst->f32;
+          float* spl = even ? lsrc->f32 : hsrc->f32;
+          float* sph = even ? hsrc->f32 : lsrc->f32;
+          int w = (int)width;
+          vsx_interleave32(dp, spl, sph, w);
+        }
+      }
+      else {
+        if (even)
+          dst->i32[0] = lsrc->i32[0];
+        else
+          dst->i32[0] = hsrc->i32[0] >> 1;
+      }
+    }
+    
+    //////////////////////////////////////////////////////////////////////////
+    void vsx_rev_horz_syn64(const param_atk* atk, const line_buf* dst, 
+                             const line_buf* lsrc, const line_buf* hsrc, 
+                             ui32 width, bool even)
+    {
+      if (width > 1)
+      {
+        bool ev = even;
+        si64* oth = hsrc->i64, * aug = lsrc->i64;
+        ui32 aug_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 oth_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = 0; j < num_steps; ++j)
+        {
+          const lifting_step* s = atk->get_step(j);
+          const si32 a = s->rev.Aatk;
+          const si32 b = s->rev.Batk;
+          const ui8 e = s->rev.Eatk;
+          v128_t va = vsx_i64x2_splat(a);
+          v128_t vb = vsx_i64x2_splat(b);
+
+          // extension
+          oth[-1] = oth[0];
+          oth[oth_width] = oth[oth_width - 1];
+          // lifting step
+          const si64* sp = oth;
+          si64* dp = aug;
+          if (a == 1)
+          { // 5/3 update and any case with a == 1
+            int i = (int)aug_width;
+            if (ev)
+            {
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = vsx_v128_load((v128_t*)sp);
+                v128_t s2 = vsx_v128_load((v128_t*)(sp - 1));
+                v128_t d = vsx_v128_load((v128_t*)dp);
+                v128_t t = vsx_i64x2_add(s1, s2);
+                v128_t v = vsx_i64x2_add(vb, t);
+                v128_t w = vsx_i64x2_shr(v, e);
+                d = vsx_i64x2_sub(d, w);
+                vsx_v128_store((v128_t*)dp, d);
+              }
+            }
+            else
+            {
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = vsx_v128_load((v128_t*)sp);
+                v128_t s2 = vsx_v128_load((v128_t*)(sp + 1));
+                v128_t d = vsx_v128_load((v128_t*)dp);
+                v128_t t = vsx_i64x2_add(s1, s2);
+                v128_t v = vsx_i64x2_add(vb, t);
+                v128_t w = vsx_i64x2_shr(v, e);
+                d = vsx_i64x2_sub(d, w);
+                vsx_v128_store((v128_t*)dp, d);
+              }
+            }
+          }
+          else if (a == -1 && b == 1 && e == 1)
+          {  // 5/3 predict
+            int i = (int)aug_width;
+            if (ev)
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = vsx_v128_load((v128_t*)sp);
+                v128_t s2 = vsx_v128_load((v128_t*)(sp - 1));
+                v128_t d = vsx_v128_load((v128_t*)dp);
+                v128_t t = vsx_i64x2_add(s1, s2);
+                v128_t w = vsx_i64x2_shr(t, e);
+                d = vsx_i64x2_add(d, w);
+                vsx_v128_store((v128_t*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = vsx_v128_load((v128_t*)sp);
+                v128_t s2 = vsx_v128_load((v128_t*)(sp + 1));
+                v128_t d = vsx_v128_load((v128_t*)dp);
+                v128_t t = vsx_i64x2_add(s1, s2);
+                v128_t w = vsx_i64x2_shr(t, e);
+                d = vsx_i64x2_add(d, w);
+                vsx_v128_store((v128_t*)dp, d);
+              }
+          }
+          else if (a == -1)
+          { // any case with a == -1, which is not 5/3 predict
+            int i = (int)aug_width;
+            if (ev)
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = vsx_v128_load((v128_t*)sp);
+                v128_t s2 = vsx_v128_load((v128_t*)(sp - 1));
+                v128_t d = vsx_v128_load((v128_t*)dp);
+                v128_t t = vsx_i64x2_add(s1, s2);
+                v128_t v = vsx_i64x2_sub(vb, t);
+                v128_t w = vsx_i64x2_shr(v, e);
+                d = vsx_i64x2_sub(d, w);
+                vsx_v128_store((v128_t*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = vsx_v128_load((v128_t*)sp);
+                v128_t s2 = vsx_v128_load((v128_t*)(sp + 1));
+                v128_t d = vsx_v128_load((v128_t*)dp);
+                v128_t t = vsx_i64x2_add(s1, s2);
+                v128_t v = vsx_i64x2_sub(vb, t);
+                v128_t w = vsx_i64x2_shr(v, e);
+                d = vsx_i64x2_sub(d, w);
+                vsx_v128_store((v128_t*)dp, d);
+              }
+          }
+          else 
+          { // general case
+            int i = (int)aug_width;
+            if (ev)
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = vsx_v128_load((v128_t*)sp);
+                v128_t s2 = vsx_v128_load((v128_t*)(sp - 1));
+                v128_t d = vsx_v128_load((v128_t*)dp);
+                v128_t t = vsx_i64x2_add(s1, s2);
+                v128_t u = vsx_i64x2_mul(va, t);
+                v128_t v = vsx_i64x2_add(vb, u);
+                v128_t w = vsx_i64x2_shr(v, e);
+                d = vsx_i64x2_sub(d, w);
+                vsx_v128_store((v128_t*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = vsx_v128_load((v128_t*)sp);
+                v128_t s2 = vsx_v128_load((v128_t*)(sp + 1));
+                v128_t d = vsx_v128_load((v128_t*)dp);
+                v128_t t = vsx_i64x2_add(s1, s2);
+                v128_t u = vsx_i64x2_mul(va, t);                
+                v128_t v = vsx_i64x2_add(vb, u);
+                v128_t w = vsx_i64x2_shr(v, e);
+                d = vsx_i64x2_sub(d, w);
+                vsx_v128_store((v128_t*)dp, d);
+              }
+          }
+
+          // swap buffers
+          si64* t = aug; aug = oth; oth = t;
+          ev = !ev;
+          ui32 w = aug_width; aug_width = oth_width; oth_width = w;
+        }
+
+        // combine both lsrc and hsrc into dst
+        {
+          double* dp  = (double*)dst->p;
+          double* spl = (double*)(even ? lsrc->p : hsrc->p);
+          double* sph = (double*)(even ? hsrc->p : lsrc->p);
+          int w = (int)width;
+          vsx_interleave64(dp, spl, sph, w);
+        }
+      }
+      else {
+        if (even)
+          dst->i64[0] = lsrc->i64[0];
+        else
+          dst->i64[0] = hsrc->i64[0] >> 1;
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    void vsx_rev_horz_syn(const param_atk* atk, const line_buf* dst, 
+                           const line_buf* lsrc, const line_buf* hsrc, 
+                           ui32 width, bool even)
+    {
+      if (dst->flags & line_buf::LFT_32BIT) 
+      {
+        assert((lsrc == NULL || lsrc->flags & line_buf::LFT_32BIT) && 
+               (hsrc == NULL || hsrc->flags & line_buf::LFT_32BIT));
+        vsx_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even);
+      }
+      else 
+      {
+        assert((dst == NULL || dst->flags & line_buf::LFT_64BIT) &&
+               (lsrc == NULL || lsrc->flags & line_buf::LFT_64BIT) && 
+               (hsrc == NULL || hsrc->flags & line_buf::LFT_64BIT));
+        vsx_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even);
+      }
+    } 
+
+  } // !local
+} // !ojph