diff --git a/.github/workflows/ccp-workflow.yml b/.github/workflows/ccp-workflow.yml index 3104ce51..e7fa60e8 100644 --- a/.github/workflows/ccp-workflow.yml +++ b/.github/workflows/ccp-workflow.yml @@ -13,6 +13,7 @@ on: options: - 'test_arm32' - 'test_s390x' + - 'test_ppc64le' default: 'test_arm32' log_level: description: 'Log level' @@ -32,7 +33,7 @@ jobs: # ] # name: ${{ matrix.system }} Build # runs-on: ${{ matrix.runner }} - # if: ${{ github.event_name != 'workflow_dispatch' || !(github.event.inputs.job_to_run != 'test_arm32' || github.event.inputs.job_to_run != 'test_s390x' )}} + # if: ${{ github.event_name != 'workflow_dispatch' || !(github.event.inputs.job_to_run != 'test_arm32' || github.event.inputs.job_to_run != 'test_s390x' || github.event.inputs.job_to_run != 'test_ppc64le' )}} # steps: # - uses: actions/checkout@v5 # - name: cmake @@ -51,7 +52,7 @@ jobs: ] name: ${{ matrix.system }} Build runs-on: ${{ matrix.runner }} - if: ${{ github.event_name != 'workflow_dispatch' || !(github.event.inputs.job_to_run != 'test_arm32' || github.event.inputs.job_to_run != 'test_s390x' )}} + if: ${{ github.event_name != 'workflow_dispatch' || !(github.event.inputs.job_to_run != 'test_arm32' || github.event.inputs.job_to_run != 'test_s390x' || github.event.inputs.job_to_run != 'test_ppc64le' )}} steps: - uses: actions/checkout@v5 - name: cmake @@ -70,7 +71,7 @@ jobs: # ] # name: ${{ matrix.system }} Build # runs-on: ${{ matrix.runner }} - # if: ${{ github.event_name != 'workflow_dispatch' || !(github.event.inputs.job_to_run != 'test_arm32' || github.event.inputs.job_to_run != 'test_s390x' )}} + # if: ${{ github.event_name != 'workflow_dispatch' || !(github.event.inputs.job_to_run != 'test_arm32' || github.event.inputs.job_to_run != 'test_s390x' || github.event.inputs.job_to_run != 'test_ppc64le' )}} # steps: # - uses: actions/checkout@v5 # - name: cmake @@ -90,7 +91,7 @@ jobs: # ] # name: ${{ matrix.system }} Build # runs-on: ${{ matrix.runner }} - # if: ${{ github.event_name != 'workflow_dispatch' || !(github.event.inputs.job_to_run != 'test_arm32' || github.event.inputs.job_to_run != 'test_s390x' )}} + # if: ${{ github.event_name != 'workflow_dispatch' || !(github.event.inputs.job_to_run != 'test_arm32' || github.event.inputs.job_to_run != 'test_s390x' || github.event.inputs.job_to_run != 'test_ppc64le' )}} # defaults: # run: # shell: msys2 {0} @@ -117,7 +118,7 @@ jobs: # ] # name: ${{ matrix.system }} Build # runs-on: ${{ matrix.runner }} - # if: ${{ github.event_name != 'workflow_dispatch' || !(github.event.inputs.job_to_run != 'test_arm32' || github.event.inputs.job_to_run != 'test_s390x' )}} + # if: ${{ github.event_name != 'workflow_dispatch' || !(github.event.inputs.job_to_run != 'test_arm32' || github.event.inputs.job_to_run != 'test_s390x' || github.event.inputs.job_to_run != 'test_ppc64le' )}} # steps: # - uses: actions/checkout@v5 # - name: cmake @@ -139,7 +140,7 @@ jobs: ] name: ${{ matrix.system }} Build and Test runs-on: ${{ matrix.runner }} - if: ${{ github.event_name != 'workflow_dispatch' || !(github.event.inputs.job_to_run != 'test_arm32' || github.event.inputs.job_to_run != 'test_s390x' )}} + if: ${{ github.event_name != 'workflow_dispatch' || !(github.event.inputs.job_to_run != 'test_arm32' || github.event.inputs.job_to_run != 'test_s390x' || github.event.inputs.job_to_run != 'test_ppc64le' )}} steps: - uses: actions/checkout@v5 - name: cmake @@ -161,7 +162,7 @@ jobs: ] name: ${{ matrix.system }} Build and Test runs-on: ${{ matrix.runner }} - if: ${{ github.event_name != 'workflow_dispatch' || !(github.event.inputs.job_to_run != 'test_arm32' || github.event.inputs.job_to_run != 'test_s390x' )}} + if: ${{ github.event_name != 'workflow_dispatch' || !(github.event.inputs.job_to_run != 'test_arm32' || github.event.inputs.job_to_run != 'test_s390x' || github.event.inputs.job_to_run != 'test_ppc64le' )}} steps: - uses: actions/checkout@v5 - name: cmake @@ -184,7 +185,7 @@ jobs: ] name: ${{ matrix.system }} Build and Test runs-on: ${{ matrix.runner }} - if: ${{ github.event_name != 'workflow_dispatch' || !(github.event.inputs.job_to_run != 'test_arm32' || github.event.inputs.job_to_run != 'test_s390x' )}} + if: ${{ github.event_name != 'workflow_dispatch' || !(github.event.inputs.job_to_run != 'test_arm32' || github.event.inputs.job_to_run != 'test_s390x' || github.event.inputs.job_to_run != 'test_ppc64le' )}} defaults: run: shell: msys2 {0} @@ -214,7 +215,7 @@ jobs: ] name: ${{ matrix.system }} Build and Test runs-on: ${{ matrix.runner }} - if: ${{ github.event_name != 'workflow_dispatch' || !(github.event.inputs.job_to_run != 'test_arm32' || github.event.inputs.job_to_run != 'test_s390x' )}} + if: ${{ github.event_name != 'workflow_dispatch' || !(github.event.inputs.job_to_run != 'test_arm32' || github.event.inputs.job_to_run != 'test_s390x' || github.event.inputs.job_to_run != 'test_ppc64le' )}} steps: - uses: actions/checkout@v5 - name: cmake @@ -266,3 +267,31 @@ jobs: cmake -DCMAKE_BUILD_TYPE=Release -DOJPH_BUILD_STREAM_EXPAND=ON -DOJPH_ENABLE_TIFF_SUPPORT=OFF -DOJPH_BUILD_TESTS=ON .. make ctest --output-on-failure + + test_powerpc64le: + name: Linux-ppc64le Build and Test + strategy: + fail-fast: false + matrix: + include: [ + { dist: ubuntu22.04 }, + { dist: ubuntu24.04 }, + { dist: ubuntu_latest }, + ] + runs-on: ubuntu-latest + if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.job_to_run == 'test_ppc64le' }} + steps: + - uses: actions/checkout@v5 + - uses: uraimo/run-on-arch-action@v3 + with: + arch: ppc64le + distro: ${{ matrix.dist }} + githubToken: ${{ github.token }} + install: | + apt-get update -q -y + apt-get install -q -y cmake make g++ libtiff-dev python3 + run: | + cd build + cmake -DCMAKE_BUILD_TYPE=Release -DOJPH_BUILD_STREAM_EXPAND=ON -DOJPH_ENABLE_TIFF_SUPPORT=OFF -DOJPH_BUILD_TESTS=ON .. + make + ctest --output-on-failure diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index 0a13d3b2..c2079ff0 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -5,11 +5,13 @@ file(GLOB CODESTREAM_SSE2 "codestream/*_sse2.cpp") file(GLOB CODESTREAM_AVX "codestream/*_avx.cpp") file(GLOB CODESTREAM_AVX2 "codestream/*_avx2.cpp") file(GLOB CODESTREAM_WASM "codestream/*_wasm.cpp") +file(GLOB CODESTREAM_VSX "codestream/*_vsx.cpp") file(GLOB CODING "coding/*.cpp" "coding/*.h") file(GLOB CODING_SSSE3 "coding/*_ssse3.cpp") file(GLOB CODING_WASM "coding/*_wasm.cpp") file(GLOB CODING_AVX2 "coding/*_avx2.cpp") file(GLOB CODING_AVX512 "coding/*_avx512.cpp") +file(GLOB CODING_VSX "coding/*_vsx.cpp") file(GLOB COMMON "openjph/*.h") file(GLOB OTHERS "others/*.cpp" "others/*.c") file(GLOB TRANSFORM "transform/*.cpp" "transform/*.h") @@ -19,10 +21,11 @@ file(GLOB TRANSFORM_AVX "transform/*_avx.cpp") file(GLOB TRANSFORM_AVX2 "transform/*_avx2.cpp") file(GLOB TRANSFORM_AVX512 "transform/*_avx512.cpp") file(GLOB TRANSFORM_WASM "transform/*_wasm.cpp") +file(GLOB TRANSFORM_VSX "transform/*_vsx.cpp") -list(REMOVE_ITEM CODESTREAM ${CODESTREAM_SSE} ${CODESTREAM_SSE2} ${CODESTREAM_AVX} ${CODESTREAM_AVX2} ${CODESTREAM_WASM}) -list(REMOVE_ITEM CODING ${CODING_SSSE3} ${CODING_WASM} ${CODING_AVX2} ${CODING_AVX512}) -list(REMOVE_ITEM TRANSFORM ${TRANSFORM_SSE} ${TRANSFORM_SSE2} ${TRANSFORM_AVX} ${TRANSFORM_AVX2} ${TRANSFORM_AVX512} ${TRANSFORM_WASM}) +list(REMOVE_ITEM CODESTREAM ${CODESTREAM_SSE} ${CODESTREAM_SSE2} ${CODESTREAM_AVX} ${CODESTREAM_AVX2} ${CODESTREAM_WASM} ${CODESTREAM_VSX}) +list(REMOVE_ITEM CODING ${CODING_SSSE3} ${CODING_WASM} ${CODING_AVX2} ${CODING_AVX512} ${CODING_VSX}) +list(REMOVE_ITEM TRANSFORM ${TRANSFORM_SSE} ${TRANSFORM_SSE2} ${TRANSFORM_AVX} ${TRANSFORM_AVX2} ${TRANSFORM_AVX512} ${TRANSFORM_WASM} ${TRANSFORM_VSX}) list(APPEND SOURCES ${CODESTREAM} ${CODING} ${COMMON} ${OTHERS} ${TRANSFORM}) source_group("codestream" FILES ${CODESTREAM}) @@ -112,6 +115,22 @@ else() endif() + if (("${OJPH_TARGET_ARCH}" MATCHES "OJPH_ARCH_PPC64") + AND (CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le|powerpc64le")) + # native 128-bit VSX kernels (see ojph_simd_vsx.h). Supported + # targets are POWER9 (ISA 3.0) and newer, little-endian only. + # The block decoder dispatch is selective; see + # ojph_codeblock_fun.cpp. + list(APPEND SOURCES ${CODESTREAM_VSX} ${CODING_VSX} ${TRANSFORM_VSX}) + source_group("codestream" FILES ${CODESTREAM_VSX}) + source_group("coding" FILES ${CODING_VSX}) + source_group("transform" FILES ${TRANSFORM_VSX}) + set_source_files_properties(codestream/ojph_codestream_vsx.cpp PROPERTIES COMPILE_FLAGS "-mcpu=power9") + set_source_files_properties(coding/ojph_block_decoder_vsx.cpp PROPERTIES COMPILE_FLAGS "-mcpu=power9") + set_source_files_properties(transform/ojph_transform_vsx.cpp PROPERTIES COMPILE_FLAGS "-mcpu=power9") + set_source_files_properties(transform/ojph_colour_vsx.cpp PROPERTIES COMPILE_FLAGS "-mcpu=power9") + endif() + endif() endif() diff --git a/src/core/codestream/ojph_codeblock_fun.cpp b/src/core/codestream/ojph_codeblock_fun.cpp index 2bf0d057..0c0345fc 100644 --- a/src/core/codestream/ojph_codeblock_fun.cpp +++ b/src/core/codestream/ojph_codeblock_fun.cpp @@ -61,16 +61,19 @@ namespace ojph { void sse_mem_clear(void* addr, size_t count); void avx_mem_clear(void* addr, size_t count); void wasm_mem_clear(void* addr, size_t count); + void vsx_mem_clear(void* addr, size_t count); ////////////////////////////////////////////////////////////////////////// ui32 gen_find_max_val32(ui32* address); ui32 sse2_find_max_val32(ui32* address); ui32 avx2_find_max_val32(ui32* address); ui32 wasm_find_max_val32(ui32* address); + ui32 vsx_find_max_val32(ui32* address); ui64 gen_find_max_val64(ui64* address); ui64 sse2_find_max_val64(ui64* address); ui64 avx2_find_max_val64(ui64* address); ui64 wasm_find_max_val64(ui64* address); + ui64 vsx_find_max_val64(ui64* address); ////////////////////////////////////////////////////////////////////////// @@ -88,8 +91,12 @@ namespace ojph { float delta_inv, ui32 count, ui32* max_val); void wasm_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, float delta_inv, ui32 count, ui32* max_val); + void vsx_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* max_val); void wasm_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, float delta_inv, ui32 count, ui32* max_val); + void vsx_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* max_val); void gen_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max, float delta_inv, ui32 count, ui64* max_val); @@ -99,6 +106,8 @@ namespace ojph { float delta_inv, ui32 count, ui64* max_val); void wasm_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max, float delta_inv, ui32 count, ui64* max_val); + void vsx_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max, + float delta_inv, ui32 count, ui64* max_val); ////////////////////////////////////////////////////////////////////////// void gen_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, @@ -115,8 +124,12 @@ namespace ojph { float delta, ui32 count); void wasm_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, float delta, ui32 count); + void vsx_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count); void wasm_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, float delta, ui32 count); + void vsx_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count); void gen_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max, float delta, ui32 count); @@ -128,6 +141,8 @@ namespace ojph { float delta, ui32 count); void wasm_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max, float delta, ui32 count); + void vsx_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max, + float delta, ui32 count); void codeblock_fun::init(bool reversible) { @@ -246,6 +261,40 @@ namespace ojph { #elif defined(OJPH_ARCH_ARM) + #elif defined(OJPH_ARCH_PPC64LE) + + // 128-bit VSX kernels; see ojph_simd_vsx.h. + // The SIMD block decoder is used everywhere on POWER10 (ISA 3.1), + // where it beats the scalar decoder on all measured content. On + // POWER9 it wins for irreversible content (more magnitude bits + // per sample) but trails the scalar decoder slightly on + // reversible content, so it is dispatched only for the former. + if (get_cpu_ext_level() >= PPC_CPU_EXT_LEVEL_ARCH_3_1 || + (!reversible && + get_cpu_ext_level() >= PPC_CPU_EXT_LEVEL_ARCH_3_00)) + decode_cb32 = ojph_decode_codeblock_vsx; + if (get_cpu_ext_level() >= PPC_CPU_EXT_LEVEL_ARCH_3_00) { + find_max_val32 = vsx_find_max_val32; + mem_clear = vsx_mem_clear; + if (reversible) { + tx_to_cb32 = vsx_rev_tx_to_cb32; + tx_from_cb32 = vsx_rev_tx_from_cb32; + } + else { + tx_to_cb32 = vsx_irv_tx_to_cb32; + tx_from_cb32 = vsx_irv_tx_from_cb32; + } + find_max_val64 = vsx_find_max_val64; + if (reversible) { + tx_to_cb64 = vsx_rev_tx_to_cb64; + tx_from_cb64 = vsx_rev_tx_from_cb64; + } + else { + tx_to_cb64 = NULL; + tx_from_cb64 = gen_irv_tx_from_cb64; + } + } + #endif // !(defined(OJPH_ARCH_X86_64) || defined(OJPH_ARCH_I386)) #endif // !OJPH_DISABLE_SIMD diff --git a/src/core/codestream/ojph_codestream_local.cpp b/src/core/codestream/ojph_codestream_local.cpp index bd523ec7..bd402844 100644 --- a/src/core/codestream/ojph_codestream_local.cpp +++ b/src/core/codestream/ojph_codestream_local.cpp @@ -640,7 +640,7 @@ namespace ojph { this->pre_alloc(); this->finalize_alloc(); - ui16 t = swap_bytes_if_le(JP2K_MARKER::SOC); + ui16 t = swap_bytes_if_le((ui16)JP2K_MARKER::SOC); if (file->write(&t, 2) != 2) OJPH_ERROR(0x00030022, "Error writing to file"); @@ -670,7 +670,7 @@ namespace ojph { OJPH_INT_TO_STRING(OPENJPH_VERSION_MINOR) "." OJPH_INT_TO_STRING(OPENJPH_VERSION_PATCH) "."; size_t len = strlen(buf); - *(ui16*)buf = swap_bytes_if_le(JP2K_MARKER::COM); + *(ui16*)buf = swap_bytes_if_le((ui16)JP2K_MARKER::COM); *(ui16*)(buf + 2) = swap_bytes_if_le((ui16)(len - 2)); //1 for General use (IS 8859-15:1999 (Latin) values) *(ui16*)(buf + 4) = swap_bytes_if_le((ui16)(1)); @@ -680,7 +680,7 @@ namespace ojph { if (comments != NULL) { for (ui32 i = 0; i < num_comments; ++i) { - t = swap_bytes_if_le(JP2K_MARKER::COM); + t = swap_bytes_if_le((ui16)JP2K_MARKER::COM); if (file->write(&t, 2) != 2) OJPH_ERROR(0x00030029, "Error writing to file"); t = swap_bytes_if_le((ui16)(comments[i].len + 4)); @@ -1151,7 +1151,7 @@ namespace ojph { } for (si32 i = 0; i < repeat; ++i) tiles[i].flush(outfile); - ui16 t = swap_bytes_if_le(JP2K_MARKER::EOC); + ui16 t = swap_bytes_if_le((ui16)JP2K_MARKER::EOC); if (!outfile->write(&t, 2)) OJPH_ERROR(0x00030071, "Error writing to file"); } diff --git a/src/core/codestream/ojph_codestream_vsx.cpp b/src/core/codestream/ojph_codestream_vsx.cpp new file mode 100644 index 00000000..9c668e00 --- /dev/null +++ b/src/core/codestream/ojph_codestream_vsx.cpp @@ -0,0 +1,289 @@ +//***************************************************************************/ +// This software is released under the 2-Clause BSD license, included +// below. +// +// Copyright (c) 2022, Aous Naman +// Copyright (c) 2022, Kakadu Software Pty Ltd, Australia +// Copyright (c) 2022, The University of New South Wales, Australia +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************/ +// This file is part of the OpenJPH software implementation. +// File: ojph_codestream_vsx.cpp +// Author: Aous Naman +// Date: 15 May 2022 +//***************************************************************************/ + +#include +#include +#include "ojph_simd_vsx.h" + +#include "ojph_defs.h" + +namespace ojph { + namespace local { + + ////////////////////////////////////////////////////////////////////////// + void vsx_mem_clear(void* addr, size_t count) + { + float* p = (float*)addr; + v128_t zero = vsx_i32x4_splat(0); + for (size_t i = 0; i < count; i += 16, p += 4) + vsx_v128_store(p, zero); + } + + ////////////////////////////////////////////////////////////////////////// + ui32 vsx_find_max_val32(ui32* address) + { + v128_t x1, x0 = vsx_v128_load(address); + x1 = vsx_i32x4_shuffle(x0, x0, 2, 3, 2, 3); // x1 = x0[2,3,2,3] + x0 = vsx_v128_or(x0, x1); + x1 = vsx_i32x4_shuffle(x0, x0, 1, 1, 1, 1); // x1 = x0[1,1,1,1] + x0 = vsx_v128_or(x0, x1); + ui32 t = (ui32)vsx_i32x4_extract_lane(x0, 0); + return t; + } + + ////////////////////////////////////////////////////////////////////////// + ui64 vsx_find_max_val64(ui64* address) + { + v128_t x1, x0 = vsx_v128_load(address); + x1 = vsx_i64x2_shuffle(x0, x0, 1, 1); // x1 = x0[2,3,2,3] + x0 = vsx_v128_or(x0, x1); + ui64 t = (ui64)vsx_i64x2_extract_lane(x0, 0); + return t; + } + + ////////////////////////////////////////////////////////////////////////// + void vsx_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* max_val) + { + ojph_unused(delta_inv); + + // convert to sign and magnitude and keep max_val + ui32 shift = 31 - K_max; + v128_t m0 = vsx_i32x4_splat(INT_MIN); + v128_t zero = vsx_i32x4_splat(0); + v128_t one = vsx_i32x4_splat(1); + v128_t tmax = vsx_v128_load(max_val); + si32 *p = (si32*)sp; + for ( ; count >= 4; count -= 4, p += 4, dp += 4) + { + v128_t v = vsx_v128_load(p); + v128_t sign = vsx_i32x4_lt(v, zero); + v128_t val = vsx_v128_xor(v, sign); // negate 1's complement + v128_t ones = vsx_v128_and(sign, one); + val = vsx_i32x4_add(val, ones); // 2's complement + sign = vsx_v128_and(sign, m0); + val = vsx_i32x4_shl(val, shift); + tmax = vsx_v128_or(tmax, val); + val = vsx_v128_or(val, sign); + vsx_v128_store(dp, val); + } + if (count) + { + v128_t v = vsx_v128_load(p); + v128_t sign = vsx_i32x4_lt(v, zero); + v128_t val = vsx_v128_xor(v, sign); // negate 1's complement + v128_t ones = vsx_v128_and(sign, one); + val = vsx_i32x4_add(val, ones); // 2's complement + sign = vsx_v128_and(sign, m0); + val = vsx_i32x4_shl(val, shift); + + v128_t c = vsx_i32x4_splat((si32)count); + v128_t idx = vsx_i32x4_make(0, 1, 2, 3); + v128_t mask = vsx_i32x4_gt(c, idx); + c = vsx_v128_and(val, mask); + tmax = vsx_v128_or(tmax, c); + + val = vsx_v128_or(val, sign); + vsx_v128_store(dp, val); + } + vsx_v128_store(max_val, tmax); + } + + ////////////////////////////////////////////////////////////////////////// + void vsx_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* max_val) + { + ojph_unused(K_max); + + //quantize and convert to sign and magnitude and keep max_val + + v128_t d = vsx_f32x4_splat(delta_inv); + v128_t zero = vsx_i32x4_splat(0); + v128_t one = vsx_i32x4_splat(1); + v128_t tmax = vsx_v128_load(max_val); + float *p = (float*)sp; + for ( ; count >= 4; count -= 4, p += 4, dp += 4) + { + v128_t vf = vsx_v128_load(p); + vf = vsx_f32x4_mul(vf, d); // multiply + v128_t val = vsx_i32x4_trunc_sat_f32x4(vf); // convert to signed int + v128_t sign = vsx_i32x4_lt(val, zero); // get sign + val = vsx_v128_xor(val, sign); // negate 1's complement + v128_t ones = vsx_v128_and(sign, one); + val = vsx_i32x4_add(val, ones); // 2's complement + tmax = vsx_v128_or(tmax, val); + sign = vsx_i32x4_shl(sign, 31); + val = vsx_v128_or(val, sign); + vsx_v128_store(dp, val); + } + if (count) + { + v128_t vf = vsx_v128_load(p); + vf = vsx_f32x4_mul(vf, d); // multiply + v128_t val = vsx_i32x4_trunc_sat_f32x4(vf); // convert to signed int + v128_t sign = vsx_i32x4_lt(val, zero); // get sign + val = vsx_v128_xor(val, sign); // negate 1's complement + v128_t ones = vsx_v128_and(sign, one); + val = vsx_i32x4_add(val, ones); // 2's complement + + v128_t c = vsx_i32x4_splat((si32)count); + v128_t idx = vsx_i32x4_make(0, 1, 2, 3); + v128_t mask = vsx_i32x4_gt(c, idx); + c = vsx_v128_and(val, mask); + tmax = vsx_v128_or(tmax, c); + + sign = vsx_i32x4_shl(sign, 31); + val = vsx_v128_or(val, sign); + vsx_v128_store(dp, val); + } + vsx_v128_store(max_val, tmax); + } + + ////////////////////////////////////////////////////////////////////////// + void vsx_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count) + { + ojph_unused(delta); + ui32 shift = 31 - K_max; + v128_t m1 = vsx_i32x4_splat(INT_MAX); + v128_t zero = vsx_i32x4_splat(0); + v128_t one = vsx_i32x4_splat(1); + si32 *p = (si32*)dp; + for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4) + { + v128_t v = vsx_v128_load((v128_t*)sp); + v128_t val = vsx_v128_and(v, m1); + val = vsx_i32x4_shr(val, shift); + v128_t sign = vsx_i32x4_lt(v, zero); + val = vsx_v128_xor(val, sign); // negate 1's complement + v128_t ones = vsx_v128_and(sign, one); + val = vsx_i32x4_add(val, ones); // 2's complement + vsx_v128_store(p, val); + } + } + + ////////////////////////////////////////////////////////////////////////// + void vsx_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count) + { + ojph_unused(K_max); + v128_t m1 = vsx_i32x4_splat(INT_MAX); + v128_t d = vsx_f32x4_splat(delta); + float *p = (float*)dp; + for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4) + { + v128_t v = vsx_v128_load((v128_t*)sp); + v128_t vali = vsx_v128_and(v, m1); + v128_t valf = vsx_f32x4_convert_i32x4(vali); + valf = vsx_f32x4_mul(valf, d); + v128_t sign = vsx_v128_andnot(v, m1); + valf = vsx_v128_or(valf, sign); + vsx_v128_store(p, valf); + } + } + + ////////////////////////////////////////////////////////////////////////// + void vsx_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max, + float delta_inv, ui32 count, ui64* max_val) + { + ojph_unused(delta_inv); + + // convert to sign and magnitude and keep max_val + ui32 shift = 63 - K_max; + v128_t m0 = vsx_i64x2_splat(LLONG_MIN); + v128_t zero = vsx_i64x2_splat(0); + v128_t one = vsx_i64x2_splat(1); + v128_t tmax = vsx_v128_load(max_val); + si64 *p = (si64*)sp; + for ( ; count >= 2; count -= 2, p += 2, dp += 2) + { + v128_t v = vsx_v128_load(p); + v128_t sign = vsx_i64x2_lt(v, zero); + v128_t val = vsx_v128_xor(v, sign); // negate 1's complement + v128_t ones = vsx_v128_and(sign, one); + val = vsx_i64x2_add(val, ones); // 2's complement + sign = vsx_v128_and(sign, m0); + val = vsx_i64x2_shl(val, shift); + tmax = vsx_v128_or(tmax, val); + val = vsx_v128_or(val, sign); + vsx_v128_store(dp, val); + } + if (count) + { + v128_t v = vsx_v128_load(p); + v128_t sign = vsx_i64x2_lt(v, zero); + v128_t val = vsx_v128_xor(v, sign); // negate 1's complement + v128_t ones = vsx_v128_and(sign, one); + val = vsx_i64x2_add(val, ones); // 2's complement + sign = vsx_v128_and(sign, m0); + val = vsx_i64x2_shl(val, shift); + + v128_t c = vsx_i32x4_make((si32)0xFFFFFFFF, (si32)0xFFFFFFFF, 0, 0); + c = vsx_v128_and(val, c); + tmax = vsx_v128_or(tmax, c); + + val = vsx_v128_or(val, sign); + vsx_v128_store(dp, val); + } + + vsx_v128_store(max_val, tmax); + } + + ////////////////////////////////////////////////////////////////////////// + void vsx_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max, + float delta, ui32 count) + { + ojph_unused(delta); + ui32 shift = 63 - K_max; + v128_t m1 = vsx_i64x2_splat(LLONG_MAX); + v128_t zero = vsx_i64x2_splat(0); + v128_t one = vsx_i64x2_splat(1); + si64 *p = (si64*)dp; + for (ui32 i = 0; i < count; i += 2, sp += 2, p += 2) + { + v128_t v = vsx_v128_load((v128_t*)sp); + v128_t val = vsx_v128_and(v, m1); + val = vsx_i64x2_shr(val, shift); + v128_t sign = vsx_i64x2_lt(v, zero); + val = vsx_v128_xor(val, sign); // negate 1's complement + v128_t ones = vsx_v128_and(sign, one); + val = vsx_i64x2_add(val, ones); // 2's complement + vsx_v128_store(p, val); + } + } + } +} diff --git a/src/core/codestream/ojph_tile.cpp b/src/core/codestream/ojph_tile.cpp index 9536c608..b866fd4c 100644 --- a/src/core/codestream/ojph_tile.cpp +++ b/src/core/codestream/ojph_tile.cpp @@ -594,7 +594,7 @@ namespace ojph { OJPH_ERROR(0x00030081, "Error writing to file"); //write start of data - ui16 t = swap_bytes_if_le(JP2K_MARKER::SOD); + ui16 t = swap_bytes_if_le((ui16)JP2K_MARKER::SOD); if (!file->write(&t, 2)) OJPH_ERROR(0x00030082, "Error writing to file"); } @@ -622,7 +622,7 @@ namespace ojph { OJPH_ERROR(0x00030083, "Error writing to file"); //write start of data - ui16 t = swap_bytes_if_le(JP2K_MARKER::SOD); + ui16 t = swap_bytes_if_le((ui16)JP2K_MARKER::SOD); if (!file->write(&t, 2)) OJPH_ERROR(0x00030084, "Error writing to file"); @@ -642,7 +642,7 @@ namespace ojph { (ui8)(c + r * num_comps), (ui8)num_tileparts)) OJPH_ERROR(0x00030085, "Error writing to file"); //write start of data - ui16 t = swap_bytes_if_le(JP2K_MARKER::SOD); + ui16 t = swap_bytes_if_le((ui16)JP2K_MARKER::SOD); if (!file->write(&t, 2)) OJPH_ERROR(0x00030086, "Error writing to file"); comps[c].write_precincts(r, file); @@ -663,7 +663,7 @@ namespace ojph { OJPH_ERROR(0x00030087, "Error writing to file"); //write start of data - ui16 t = swap_bytes_if_le(JP2K_MARKER::SOD); + ui16 t = swap_bytes_if_le((ui16)JP2K_MARKER::SOD); if (!file->write(&t, 2)) OJPH_ERROR(0x00030088, "Error writing to file"); } @@ -738,7 +738,7 @@ namespace ojph { OJPH_ERROR(0x0003008A, "Error writing to file"); //write start of data - ui16 t = swap_bytes_if_le(JP2K_MARKER::SOD); + ui16 t = swap_bytes_if_le((ui16)JP2K_MARKER::SOD); if (!file->write(&t, 2)) OJPH_ERROR(0x0003008B, "Error writing to file"); } diff --git a/src/core/coding/ojph_block_decoder.h b/src/core/coding/ojph_block_decoder.h index a1970174..9b7bb19e 100644 --- a/src/core/coding/ojph_block_decoder.h +++ b/src/core/coding/ojph_block_decoder.h @@ -77,6 +77,12 @@ namespace ojph { ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2, ui32 width, ui32 height, ui32 stride, bool stripe_causal); + // POWER VSX-accelerated decoder + bool + ojph_decode_codeblock_vsx(ui8* coded_data, ui32* decoded_data, + ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2, + ui32 width, ui32 height, ui32 stride, bool stripe_causal); + } } diff --git a/src/core/coding/ojph_block_decoder_vsx.cpp b/src/core/coding/ojph_block_decoder_vsx.cpp new file mode 100644 index 00000000..4ddcde08 --- /dev/null +++ b/src/core/coding/ojph_block_decoder_vsx.cpp @@ -0,0 +1,2226 @@ +//***************************************************************************/ +// This software is released under the 2-Clause BSD license, included +// below. +// +// Copyright (c) 2022, Aous Naman +// Copyright (c) 2022, Kakadu Software Pty Ltd, Australia +// Copyright (c) 2022, The University of New South Wales, Australia +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************/ +// This file is part of the OpenJPH software implementation. +// File: ojph_block_decoder_vsx.cpp +// Author: Aous Naman +// Date: 13 May 2022 +//***************************************************************************/ + +//***************************************************************************/ +/** @file ojph_block_decoder_vsx.cpp + * @brief implements a faster HTJ2K block decoder using POWER VSX simd + */ + +#include +#include + +#include +#include +#include "ojph_block_common.h" +#include "ojph_block_decoder.h" +#include "ojph_arch.h" +#include "ojph_message.h" + +#include "ojph_simd_vsx.h" + +namespace ojph { + namespace local { + + //************************************************************************/ + /** @brief Macros that help with typing and space + */ + #define OJPH_REPEAT2(a) a,a + #define OJPH_REPEAT4(a) a,a,a,a + #define OJPH_REPEAT8(a) a,a,a,a,a,a,a,a + #define OJPH_REPEAT16(a) a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a + + //************************************************************************/ + /** @brief MEL state structure for reading and decoding the MEL bitstream + * + * A number of events is decoded from the MEL bitstream ahead of time + * and stored in run/num_runs. + * Each run represents the number of zero events before a one event. + */ + struct dec_mel_st { + dec_mel_st() : data(NULL), tmp(0), bits(0), size(0), unstuff(false), + k(0), num_runs(0), runs(0) + {} + // data decoding machinery + ui8* data; //!bits > 32) //there are enough bits in the tmp variable + return; // return without reading new data + + ui32 val = 0xFFFFFFFF; // feed in 0xFF if buffer is exhausted + if (melp->size > 4) { // if there is data in the MEL segment + memcpy(&val, melp->data, sizeof(val)); // read 32 bits from MEL data + melp->data += 4; // advance pointer + melp->size -= 4; // reduce counter + } + else if (melp->size > 0) + { // 4 or less + int i = 0; + while (melp->size > 1) { + ui32 v = *melp->data++; // read one byte at a time + ui32 m = ~(0xFFu << i); // mask of location + val = (val & m) | (v << i);// put one byte in its correct location + --melp->size; + i += 8; + } + // size equal to 1 + ui32 v = *melp->data++; // the one before the last is different + v |= 0xF; // MEL and VLC segments can overlap + ui32 m = ~(0xFFu << i); + val = (val & m) | (v << i); + --melp->size; + } + + // next we unstuff them before adding them to the buffer + int bits = 32 - melp->unstuff; // number of bits in val, subtract 1 if + // the previously read byte requires + // unstuffing + + // data is unstuffed and accumulated in t + // bits has the number of bits in t + ui32 t = val & 0xFF; + bool unstuff = ((val & 0xFF) == 0xFF); // true if we need unstuffing + bits -= unstuff; // there is one less bit in t if unstuffing is needed + t = t << (8 - unstuff); // move up to make room for the next byte + + //this is a repeat of the above + t |= (val>>8) & 0xFF; + unstuff = (((val >> 8) & 0xFF) == 0xFF); + bits -= unstuff; + t = t << (8 - unstuff); + + t |= (val>>16) & 0xFF; + unstuff = (((val >> 16) & 0xFF) == 0xFF); + bits -= unstuff; + t = t << (8 - unstuff); + + t |= (val>>24) & 0xFF; + melp->unstuff = (((val >> 24) & 0xFF) == 0xFF); + + // move t to tmp, and push the result all the way up, so we read from + // the MSB + melp->tmp |= ((ui64)t) << (64 - bits - melp->bits); + melp->bits += bits; //increment the number of bits in tmp + } + + //************************************************************************/ + /** @brief Decodes unstuffed MEL segment bits stored in tmp to runs + * + * Runs are stored in "runs" and the number of runs in "num_runs". + * Each run represents a number of zero events that may or may not + * terminate in a 1 event. + * Each run is stored in 7 bits. The LSB is 1 if the run terminates in + * a 1 event, 0 otherwise. The next 6 bits, for the case terminating + * with 1, contain the number of consecutive 0 zero events * 2; for the + * case terminating with 0, they store (number of consecutive 0 zero + * events - 1) * 2. + * A total of 6 bits (made up of 1 + 5) should have been enough. + * + * @param [in] melp is a pointer to dec_mel_st structure + */ + static inline + void mel_decode(dec_mel_st *melp) + { + static const int mel_exp[13] = { //MEL exponents + 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5 + }; + + if (melp->bits < 6) // if there are less than 6 bits in tmp + mel_read(melp); // then read from the MEL bitstream + // 6 bits is the largest decodable MEL cwd + + //repeat so long that there is enough decodable bits in tmp, + // and the runs store is not full (num_runs < 8) + while (melp->bits >= 6 && melp->num_runs < 8) + { + int eval = mel_exp[melp->k]; // number of bits associated with state + int run = 0; + if (melp->tmp & (1ull<<63)) //The next bit to decode (stored in MSB) + { //one is found + run = 1 << eval; + run--; // consecutive runs of 0 events - 1 + melp->k = melp->k + 1 < 12 ? melp->k + 1 : 12;//increment, max is 12 + melp->tmp <<= 1; // consume one bit from tmp + melp->bits -= 1; + run = run << 1; // a stretch of zeros not terminating in one + } + else + { //0 is found + run = (int)(melp->tmp >> (63 - eval)) & ((1 << eval) - 1); + melp->k = melp->k - 1 > 0 ? melp->k - 1 : 0; //decrement, min is 0 + melp->tmp <<= eval + 1; //consume eval + 1 bits (max is 6) + melp->bits -= eval + 1; + run = (run << 1) + 1; // a stretch of zeros terminating with one + } + eval = melp->num_runs * 7; // 7 bits per run + melp->runs &= ~((ui64)0x3F << eval); // 6 bits are sufficient + melp->runs |= ((ui64)run) << eval; // store the value in runs + melp->num_runs++; // increment count + } + } + + //************************************************************************/ + /** @brief Initiates a dec_mel_st structure for MEL decoding and reads + * some bytes in order to get the read address to a multiple + * of 4 + * + * @param [in] melp is a pointer to dec_mel_st structure + * @param [in] bbuf is a pointer to byte buffer + * @param [in] lcup is the length of MagSgn+MEL+VLC segments + * @param [in] scup is the length of MEL+VLC segments + */ + static inline + void mel_init(dec_mel_st *melp, ui8* bbuf, int lcup, int scup) + { + melp->data = bbuf + lcup - scup; // move the pointer to the start of MEL + melp->bits = 0; // 0 bits in tmp + melp->tmp = 0; // + melp->unstuff = false; // no unstuffing + melp->size = scup - 1; // size is the length of MEL+VLC-1 + melp->k = 0; // 0 for state + melp->num_runs = 0; // num_runs is 0 + melp->runs = 0; // + + //This code is borrowed; original is for a different architecture + //These few lines take care of the case where data is not at a multiple + // of 4 boundary. It reads 1,2,3 up to 4 bytes from the MEL segment + int num = 4 - (int)(intptr_t(melp->data) & 0x3); + for (int i = 0; i < num; ++i) { // this code is similar to mel_read + assert(melp->unstuff == false || melp->data[0] <= 0x8F); + ui64 d = (melp->size > 0) ? *melp->data : 0xFF;//if buffer is consumed + //set data to 0xFF + if (melp->size == 1) d |= 0xF; //if this is MEL+VLC-1, set LSBs to 0xF + // see the standard + melp->data += melp->size-- > 0; //increment if the end is not reached + int d_bits = 8 - melp->unstuff; //if unstuffing is needed, reduce by 1 + melp->tmp = (melp->tmp << d_bits) | d; //store bits in tmp + melp->bits += d_bits; //increment tmp by number of bits + melp->unstuff = ((d & 0xFF) == 0xFF); //true of next byte needs + //unstuffing + } + melp->tmp <<= (64 - melp->bits); //push all the way up so the first bit + // is the MSB + } + + //************************************************************************/ + /** @brief Retrieves one run from dec_mel_st; if there are no runs stored + * MEL segment is decoded + * + * @param [in] melp is a pointer to dec_mel_st structure + */ + static inline + int mel_get_run(dec_mel_st *melp) + { + if (melp->num_runs == 0) //if no runs, decode more bit from MEL segment + mel_decode(melp); + + int t = melp->runs & 0x7F; //retrieve one run + melp->runs >>= 7; // remove the retrieved run + melp->num_runs--; + return t; // return run + } + + //************************************************************************/ + /** @brief A structure for reading and unstuffing a segment that grows + * backward, such as VLC and MRP + */ + struct rev_struct { + rev_struct() : data(NULL), tmp(0), bits(0), size(0), unstuff(false) + {} + //storage + ui8* data; //!bits > 32) // if there are more than 32 bits in tmp, then + return; // reading 32 bits can overflow vlcp->tmp + ui32 val = 0; + //the next line (the if statement) needs to be tested first + if (vlcp->size > 3) // if there are more than 3 bytes left in VLC + { + // (vlcp->data - 3) move pointer back to read 32 bits at once + memcpy(&val, vlcp->data - 3, sizeof(val)); // then read 32 bits + vlcp->data -= 4; // move data pointer back by 4 + vlcp->size -= 4; // reduce available byte by 4 + } + else if (vlcp->size > 0) + { // 4 or less + int i = 24; + while (vlcp->size > 0) { + ui32 v = *vlcp->data--; // read one byte at a time + val |= (v << i); // put byte in its correct location + --vlcp->size; + i -= 8; + } + } + + //accumulate in tmp, number of bits in tmp are stored in bits + ui32 tmp = val >> 24; //start with the MSB byte + ui32 bits; + + // test unstuff (previous byte is >0x8F), and this byte is 0x7F + bits = 8 - ((vlcp->unstuff && (((val >> 24) & 0x7F) == 0x7F)) ? 1 : 0); + bool unstuff = (val >> 24) > 0x8F; //this is for the next byte + + tmp |= ((val >> 16) & 0xFF) << bits; //process the next byte + bits += 8 - ((unstuff && (((val >> 16) & 0x7F) == 0x7F)) ? 1 : 0); + unstuff = ((val >> 16) & 0xFF) > 0x8F; + + tmp |= ((val >> 8) & 0xFF) << bits; + bits += 8 - ((unstuff && (((val >> 8) & 0x7F) == 0x7F)) ? 1 : 0); + unstuff = ((val >> 8) & 0xFF) > 0x8F; + + tmp |= (val & 0xFF) << bits; + bits += 8 - ((unstuff && ((val & 0x7F) == 0x7F)) ? 1 : 0); + unstuff = (val & 0xFF) > 0x8F; + + // now move the read and unstuffed bits into vlcp->tmp + vlcp->tmp |= (ui64)tmp << vlcp->bits; + vlcp->bits += bits; + vlcp->unstuff = unstuff; // this for the next read + } + + //************************************************************************/ + /** @brief Initiates the rev_struct structure and reads a few bytes to + * move the read address to multiple of 4 + * + * There is another similar rev_init_mrp subroutine. The difference is + * that this one, rev_init, discards the first 12 bits (they have the + * sum of the lengths of VLC and MEL segments), and first unstuff depends + * on first 4 bits. + * + * @param [in] vlcp is a pointer to rev_struct structure + * @param [in] data is a pointer to byte at the start of the cleanup pass + * @param [in] lcup is the length of MagSgn+MEL+VLC segments + * @param [in] scup is the length of MEL+VLC segments + */ + static inline + void rev_init(rev_struct *vlcp, ui8* data, int lcup, int scup) + { + //first byte has only the upper 4 bits + vlcp->data = data + lcup - 2; + + //size can not be larger than this, in fact it should be smaller + vlcp->size = scup - 2; + + ui32 d = *vlcp->data--; // read one byte (this is a half byte) + vlcp->tmp = d >> 4; // both initialize and set + vlcp->bits = 4 - ((vlcp->tmp & 7) == 7); //check standard + vlcp->unstuff = (d | 0xF) > 0x8F; //this is useful for the next byte + + //This code is designed for an architecture that read address should + // align to the read size (address multiple of 4 if read size is 4) + //These few lines take care of the case where data is not at a multiple + // of 4 boundary. It reads 1,2,3 up to 4 bytes from the VLC bitstream. + // To read 32 bits, read from (vlcp->data - 3) + int num = 1 + (int)(intptr_t(vlcp->data) & 0x3); + int tnum = num < vlcp->size ? num : vlcp->size; + for (int i = 0; i < tnum; ++i) { + ui64 d; + d = *vlcp->data--; // read one byte and move read pointer + //check if the last byte was >0x8F (unstuff == true) and this is 0x7F + ui32 d_bits = 8 - ((vlcp->unstuff && ((d & 0x7F) == 0x7F)) ? 1 : 0); + vlcp->tmp |= d << vlcp->bits; // move data to vlcp->tmp + vlcp->bits += d_bits; + vlcp->unstuff = d > 0x8F; // for next byte + } + vlcp->size -= tnum; + rev_read(vlcp); // read another 32 buts + } + + //************************************************************************/ + /** @brief Retrieves 32 bits from the head of a rev_struct structure + * + * By the end of this call, vlcp->tmp must have no less than 33 bits + * + * @param [in] vlcp is a pointer to rev_struct structure + */ + static inline + ui32 rev_fetch(rev_struct *vlcp) + { + if (vlcp->bits < 32) // if there are less then 32 bits, read more + { + rev_read(vlcp); // read 32 bits, but unstuffing might reduce this + if (vlcp->bits < 32)// if there is still space in vlcp->tmp for 32 bits + rev_read(vlcp); // read another 32 + } + return (ui32)vlcp->tmp; // return the head (bottom-most) of vlcp->tmp + } + + //************************************************************************/ + /** @brief Consumes num_bits from a rev_struct structure + * + * @param [in] vlcp is a pointer to rev_struct structure + * @param [in] num_bits is the number of bits to be removed + */ + static inline + ui32 rev_advance(rev_struct *vlcp, ui32 num_bits) + { + assert(num_bits <= vlcp->bits); // vlcp->tmp must have more than num_bits + vlcp->tmp >>= num_bits; // remove bits + vlcp->bits -= num_bits; // decrement the number of bits + return (ui32)vlcp->tmp; + } + + //************************************************************************/ + /** @brief Reads and unstuffs from rev_struct + * + * This is different than rev_read in that this fills in zeros when the + * the available data is consumed. The other does not care about the + * values when all data is consumed. + * + * See rev_read for more information about unstuffing + * + * @param [in] mrp is a pointer to rev_struct structure + */ + static inline + void rev_read_mrp(rev_struct *mrp) + { + //process 4 bytes at a time + if (mrp->bits > 32) + return; + ui32 val = 0; + if (mrp->size > 3) // If there are 3 byte or more + { // (mrp->data - 3) move pointer back to read 32 bits at once + memcpy(&val, mrp->data - 3, sizeof(val)); // read 32 bits + mrp->data -= 4; // move back pointer + mrp->size -= 4; // reduce count + } + else if (mrp->size > 0) + { + int i = 24; + while (mrp->size > 0) { + ui32 v = *mrp->data--; // read one byte at a time + val |= (v << i); // put byte in its correct location + --mrp->size; + i -= 8; + } + } + + //accumulate in tmp, and keep count in bits + ui32 bits, tmp = val >> 24; + + //test if the last byte > 0x8F (unstuff must be true) and this is 0x7F + bits = 8 - ((mrp->unstuff && (((val >> 24) & 0x7F) == 0x7F)) ? 1 : 0); + bool unstuff = (val >> 24) > 0x8F; + + //process the next byte + tmp |= ((val >> 16) & 0xFF) << bits; + bits += 8 - ((unstuff && (((val >> 16) & 0x7F) == 0x7F)) ? 1 : 0); + unstuff = ((val >> 16) & 0xFF) > 0x8F; + + tmp |= ((val >> 8) & 0xFF) << bits; + bits += 8 - ((unstuff && (((val >> 8) & 0x7F) == 0x7F)) ? 1 : 0); + unstuff = ((val >> 8) & 0xFF) > 0x8F; + + tmp |= (val & 0xFF) << bits; + bits += 8 - ((unstuff && ((val & 0x7F) == 0x7F)) ? 1 : 0); + unstuff = (val & 0xFF) > 0x8F; + + mrp->tmp |= (ui64)tmp << mrp->bits; // move data to mrp pointer + mrp->bits += bits; + mrp->unstuff = unstuff; // next byte + } + + //************************************************************************/ + /** @brief Initialized rev_struct structure for MRP segment, and reads + * a number of bytes such that the next 32 bits read are from + * an address that is a multiple of 4. Note this is designed for + * an architecture that read size must be compatible with the + * alignment of the read address + * + * There is another similar subroutine rev_init. This subroutine does + * NOT skip the first 12 bits, and starts with unstuff set to true. + * + * @param [in] mrp is a pointer to rev_struct structure + * @param [in] data is a pointer to byte at the start of the cleanup pass + * @param [in] lcup is the length of MagSgn+MEL+VLC segments + * @param [in] len2 is the length of SPP+MRP segments + */ + static inline + void rev_init_mrp(rev_struct *mrp, ui8* data, int lcup, int len2) + { + mrp->data = data + lcup + len2 - 1; + mrp->size = len2; + mrp->unstuff = true; + mrp->bits = 0; + mrp->tmp = 0; + + //This code is designed for an architecture that read address should + // align to the read size (address multiple of 4 if read size is 4) + //These few lines take care of the case where data is not at a multiple + // of 4 boundary. It reads 1,2,3 up to 4 bytes from the MRP stream + int num = 1 + (int)(intptr_t(mrp->data) & 0x3); + for (int i = 0; i < num; ++i) { + ui64 d; + //read a byte, 0 if no more data + d = (mrp->size-- > 0) ? *mrp->data-- : 0; + //check if unstuffing is needed + ui32 d_bits = 8 - ((mrp->unstuff && ((d & 0x7F) == 0x7F)) ? 1 : 0); + mrp->tmp |= d << mrp->bits; // move data to vlcp->tmp + mrp->bits += d_bits; + mrp->unstuff = d > 0x8F; // for next byte + } + rev_read_mrp(mrp); + } + + //************************************************************************/ + /** @brief Retrieves 32 bits from the head of a rev_struct structure + * + * By the end of this call, mrp->tmp must have no less than 33 bits + * + * @param [in] mrp is a pointer to rev_struct structure + */ + static inline + ui32 rev_fetch_mrp(rev_struct *mrp) + { + if (mrp->bits < 32) // if there are less than 32 bits in mrp->tmp + { + rev_read_mrp(mrp); // read 30-32 bits from mrp + if (mrp->bits < 32) // if there is a space of 32 bits + rev_read_mrp(mrp); // read more + } + return (ui32)mrp->tmp; // return the head of mrp->tmp + } + + //************************************************************************/ + /** @brief Consumes num_bits from a rev_struct structure + * + * @param [in] mrp is a pointer to rev_struct structure + * @param [in] num_bits is the number of bits to be removed + */ + inline ui32 rev_advance_mrp(rev_struct *mrp, ui32 num_bits) + { + assert(num_bits <= mrp->bits); // we must not consume more than mrp->bits + mrp->tmp >>= num_bits; // discard the lowest num_bits bits + mrp->bits -= num_bits; + return (ui32)mrp->tmp; // return data after consumption + } + + //************************************************************************/ + /** @brief State structure for reading and unstuffing of forward-growing + * bitstreams; these are: MagSgn and SPP bitstreams + */ + struct frwd_struct { + const ui8* data; //! + static inline + void frwd_read(frwd_struct *msp) + { + assert(msp->bits <= 128); + + v128_t offset, val, validity, all_xff; + val = vsx_v128_load(msp->data); + int bytes = msp->size >= 16 ? 16 : msp->size; + validity = vsx_i8x16_splat((char)bytes); + msp->data += bytes; + msp->size -= bytes; + ui32 bits = 128; + offset = vsx_i64x2_const(0x0706050403020100,0x0F0E0D0C0B0A0908); + validity = vsx_i8x16_gt(validity, offset); + all_xff = vsx_i8x16_const(OJPH_REPEAT16(-1)); + if (X == 0xFF) // the compiler should remove this if statement + { + v128_t t = vsx_v128_xor(validity, all_xff); // complement + val = vsx_v128_or(t, val); // fill with 0xFF + } + else if (X == 0) + val = vsx_v128_and(validity, val); // fill with zeros + else + assert(0); + + v128_t ff_bytes; + ff_bytes = vsx_i8x16_eq(val, all_xff); + ff_bytes = vsx_v128_and(ff_bytes, validity); + ui32 flags = vsx_i8x16_bitmask(ff_bytes); + flags <<= 1; // unstuff following byte + ui32 next_unstuff = flags >> 16; + flags |= msp->unstuff; + flags &= 0xFFFF; + while (flags) + { // bit unstuffing occurs on average once every 256 bytes + // therefore it is not an issue if it is a bit slow + // here we process 16 bytes + --bits; // consuming one stuffing bit + + ui32 loc = 31 - count_leading_zeros(flags); + flags ^= 1 << loc; + + v128_t m, t, c; + t = vsx_i8x16_splat((char)loc); + m = vsx_i8x16_gt(offset, t); + + t = vsx_v128_and(m, val); // keep bits at locations larger than loc + c = vsx_u64x2_shr(t, 1); // 1 bits left + t = vsx_i64x2_shuffle(t, vsx_i64x2_const(0, 0), 1, 2); + t = vsx_i64x2_shl(t, 63); // keep the MSB only + t = vsx_v128_or(t, c); // combine the above 3 steps + + val = vsx_v128_or(t, vsx_v128_andnot(val, m)); + } + + // combine with earlier data + assert(msp->bits >= 0 && msp->bits <= 128); + int cur_bytes = msp->bits >> 3; + ui32 cur_bits = msp->bits & 7; + v128_t b1, b2; + b1 = vsx_i64x2_shl(val, cur_bits); + //next shift 8 bytes right + b2 = vsx_i64x2_shuffle(vsx_i64x2_const(0, 0), val, 1, 2); + b2 = vsx_u64x2_shr(b2, 64u - cur_bits); + b2 = (cur_bits > 0) ? b2 : vsx_i64x2_const(0, 0); + b1 = vsx_v128_or(b1, b2); + b2 = vsx_v128_load(msp->tmp + cur_bytes); + b2 = vsx_v128_or(b1, b2); + vsx_v128_store(msp->tmp + cur_bytes, b2); + + ui32 consumed_bits = bits < 128u - cur_bits ? bits : 128u - cur_bits; + cur_bytes = (msp->bits + consumed_bits + 7) >> 3; // round up + int upper = vsx_u16x8_extract_lane(val, 7); + upper >>= consumed_bits + 16 - 128; + msp->tmp[cur_bytes] = (ui8)upper; // copy byte + + msp->bits += bits; + msp->unstuff = next_unstuff; // next unstuff + assert(msp->unstuff == 0 || msp->unstuff == 1); + } + + //************************************************************************/ + /** @brief Initialize frwd_struct struct and reads some bytes + * + * @tparam X is the value fed in when the bitstream is exhausted. + * See frwd_read regarding the template + * @param [in] msp is a pointer to frwd_struct + * @param [in] data is a pointer to the start of data + * @param [in] size is the number of byte in the bitstream + */ + template + static inline + void frwd_init(frwd_struct *msp, const ui8* data, int size) + { + msp->data = data; + vsx_v128_store(msp->tmp, vsx_i64x2_const(0, 0)); + vsx_v128_store(msp->tmp + 16, vsx_i64x2_const(0, 0)); + vsx_v128_store(msp->tmp + 32, vsx_i64x2_const(0, 0)); + + msp->bits = 0; + msp->unstuff = 0; + msp->size = size; + + frwd_read(msp); // read 128 bits more + } + + //************************************************************************/ + /** @brief Consume num_bits bits from the bitstream of frwd_struct + * + * @param [in] msp is a pointer to frwd_struct + * @param [in] num_bits is the number of bit to consume + */ + static inline + void frwd_advance(frwd_struct *msp, ui32 num_bits) + { + assert(num_bits > 0 && num_bits <= msp->bits && num_bits < 128); + msp->bits -= num_bits; + + v128_t *p = (v128_t*)(msp->tmp + ((num_bits >> 3) & 0x18)); + num_bits &= 63; + + v128_t v0, v1, c0, c1, t; + v0 = vsx_v128_load(p); + v1 = vsx_v128_load(p + 1); + + // shift right by num_bits + c0 = vsx_u64x2_shr(v0, num_bits); + t = vsx_i64x2_shuffle(v0, vsx_i64x2_const(0, 0), 1, 2); + t = vsx_i64x2_shl(t, 64 - num_bits); + t = (num_bits > 0) ? t : vsx_i64x2_const(0, 0); + c0 = vsx_v128_or(c0, t); + t = vsx_i64x2_shuffle(vsx_i64x2_const(0, 0), v1, 1, 2); + t = vsx_i64x2_shl(t, 64 - num_bits); + t = (num_bits > 0) ? t : vsx_i64x2_const(0, 0); + c0 = vsx_v128_or(c0, t); + + vsx_v128_store(msp->tmp, c0); + + c1 = vsx_u64x2_shr(v1, num_bits); + t = vsx_i64x2_shuffle(v1, vsx_i64x2_const(0, 0), 1, 2); + t = vsx_i64x2_shl(t, 64 - num_bits); + t = (num_bits > 0) ? t : vsx_i64x2_const(0, 0); + c1 = vsx_v128_or(c1, t); + + vsx_v128_store(msp->tmp + 16, c1); + } + + //************************************************************************/ + /** @brief Fetches 32 bits from the frwd_struct bitstream + * + * @tparam X is the value fed in when the bitstream is exhausted. + * See frwd_read regarding the template + * @param [in] msp is a pointer to frwd_struct + */ + template + static inline + v128_t frwd_fetch(frwd_struct *msp) + { + if (msp->bits <= 128) + { + frwd_read(msp); + if (msp->bits <= 128) //need to test + frwd_read(msp); + } + v128_t t = vsx_v128_load(msp->tmp); + return t; + } + + //************************************************************************/ + /** @brief Destuffs a bitstream into a contiguous buffer, upfront + * + * Removes bit stuffing once for the whole stream, so that fetching + * bits at a given position later needs no serial state; this keeps + * the per-quad bit-consumption chain in general-purpose registers + * (pos += bits) instead of shifting a vector-resident window, which + * on POWER costs vector stores/reloads through the stack. + * + * @tparam X is the value fed in when the bitstream is exhausted + * @param [in] src is a pointer to the start of the bitstream + * @param [in] size is the number of bytes in the bitstream + * @param [in] dst is the destination buffer, of capacity cap + 72 + * @param [in] cap is the destination capacity, excluding padding + * @return the clamp offset; bytes at or beyond this offset hold + * no stream bits (they read as X) + */ + template + static inline + ui32 destuff_frwd(const ui8* src, int size, ui8* dst, ui32 cap) + { + if (size < 0) + size = 0; + ui8* o = dst; + ui8* o_end = dst + cap; + const ui8* s = src; + const ui8* s_end = src + size; + ui64 acc = 0; // partial output byte, low nb bits are valid + ui32 nb = 0; // number of valid bits in acc; always < 8 + bool prev_ff = false; + + // fast path; 16 source bytes at a time when they contain no 0xFF + while (s + 16 <= s_end && o + 24 <= o_end) + { + v128_t v = vsx_v128_load(s); + if (vec_any_eq((vsx_v_u8)v, vec_splats((unsigned char)0xFF)) + || prev_ff) + { // process these 16 bytes one at a time + for (int i = 0; i < 16; ++i) { + ui8 b = *s++; + acc |= (ui64)b << nb; + nb += prev_ff ? 7u : 8u; + prev_ff = (b == 0xFFu); + if (nb >= 8) { *o++ = (ui8)acc; acc >>= 8; nb -= 8; } + } + continue; + } + ui64 v0, v1; + memcpy(&v0, s, 8); + memcpy(&v1, s + 8, 8); + ui64 w0 = acc | (v0 << nb); + ui64 w1 = (v1 << nb) | (nb ? (v0 >> (64 - nb)) : 0); + memcpy(o, &w0, 8); + memcpy(o + 8, &w1, 8); + acc = nb ? (v1 >> (64 - nb)) : 0; + o += 16; + s += 16; + } + // tail; one byte at a time + while (s < s_end && o < o_end) + { + ui8 b = *s++; + acc |= (ui64)b << nb; + nb += prev_ff ? 7u : 8u; + prev_ff = (b == 0xFFu); + if (nb >= 8) { *o++ = (ui8)acc; acc >>= 8; nb -= 8; } + } + // fill the bits above nb with X, and pad with X bytes + ui32 fill = (X == 0xFF) ? (0xFFu << nb) : 0; + *o = (ui8)((ui32)acc | fill); + memset(o + 1, X, 64); + return (ui32)(o - dst) + 1; + } + + //************************************************************************/ + /** @brief Fetches 128 bits from a destuffed bitstream buffer + * + * Returns the 128 bits starting at bit position pos of the buffer + * produced by destuff_frwd. Carries no serial state; fetches at + * independent positions can execute out of order. The byte offset + * is clamped to limit so that positions past the end of the stream + * read from the padding without leaving the buffer. + * + * @param [in] dbuf is the destuffed bitstream buffer + * @param [in] limit is the clamp offset returned by destuff_frwd + * @param [in] pos is the absolute bit position to fetch from + */ + static inline + v128_t vsx_dfetch(const ui8* dbuf, ui32 limit, ui32 pos) + { + ui32 off = pos >> 3; + off = off < limit ? off : limit; + const ui8* p = dbuf + off; + v128_t v = vsx_v128_load(p); + v128_t w = vsx_v128_load(p + 8); + int k = (int)(pos & 7); + v128_t r = vsx_u64x2_shr(v, k); + // shift left by 64 - k without branching on k == 0; vector shifts + // are modulo 64, so the shift is split into 1 and 63 - k + v128_t c = vsx_i64x2_shl(vsx_i64x2_shl(w, 1), 63 - k); + return vsx_v128_or(r, c); + } + + //************************************************************************/ + /** @brief decodes one quad, using 32 bit data + * + * @tparam N 0 for the first quad and 1 for the second quad in an + * octet + * @param inf_u_q decoded VLC code, with interleaved u values + * @param U_q U values + * @param magsgn structure for forward data buffer + * @param p bitplane at which we are decoding + * @param vn used for handling E values (stores v_n values) + * @return v128_t decoded quad + */ + template + static inline + v128_t decode_one_quad32(const v128_t inf_u_q, v128_t U_q, + frwd_struct* magsgn, ui32 p, v128_t& vn) + { + v128_t w0; // workers + v128_t insig; // lanes hold FF's if samples are insignificant + v128_t flags; // lanes hold e_k, e_1, and rho + v128_t row; // decoded row + + row = vsx_i64x2_const(0, 0); + w0 = vsx_i32x4_shuffle(inf_u_q, inf_u_q, N, N, N, N); + // we keeps e_k, e_1, and rho in w2 + flags = vsx_v128_and(w0, vsx_i32x4_const(0x1110,0x2220,0x4440,0x8880)); + insig = vsx_i32x4_eq(flags, vsx_i64x2_const(0, 0)); + if (vsx_i8x16_bitmask(insig) != 0xFFFF) //are all insignificant? + { + U_q = vsx_i32x4_shuffle(U_q, U_q, N, N, N, N); + flags = vsx_i16x8_mul(flags, vsx_i16x8_const(8,8,4,4,2,2,1,1)); + v128_t ms_vec = frwd_fetch<0xFF>(magsgn); + + // U_q holds U_q for this quad + // flags has e_k, e_1, and rho such that e_k is sitting in the + // 0x8000, e_1 in 0x800, and rho in 0x80 + + // next e_k and m_n + v128_t m_n; + w0 = vsx_u32x4_shr(flags, 15); // e_k + m_n = vsx_i32x4_sub(U_q, w0); + m_n = vsx_v128_andnot(m_n, insig); + + // find cumulative sums + // to find at which bit in ms_vec the sample starts + v128_t ex_sum, shfl, inc_sum = m_n; // inclusive scan + shfl = vsx_i32x4_shuffle(vsx_i64x2_const(0,0), inc_sum, 3, 4, 5, 6); + inc_sum = vsx_i32x4_add(inc_sum, shfl); + shfl = vsx_i64x2_shuffle(vsx_i64x2_const(0,0), inc_sum, 1, 2); + inc_sum = vsx_i32x4_add(inc_sum, shfl); + int total_mn = vsx_u16x8_extract_lane(inc_sum, 6); + ex_sum = vsx_i32x4_shuffle(vsx_i64x2_const(0,0), inc_sum, 3, 4, 5, 6); + + // find the starting byte and starting bit + v128_t byte_idx = vsx_u32x4_shr(ex_sum, 3); + v128_t bit_idx = + vsx_v128_and(ex_sum, vsx_i32x4_const(OJPH_REPEAT4(7))); + byte_idx = vsx_i8x16_swizzle(byte_idx, + vsx_i32x4_const(0x00000000, 0x04040404, 0x08080808, 0x0C0C0C0C)); + byte_idx = + vsx_i32x4_add(byte_idx, vsx_i32x4_const(OJPH_REPEAT4(0x03020100))); + v128_t d0 = vsx_i8x16_swizzle(ms_vec, byte_idx); + byte_idx = + vsx_i32x4_add(byte_idx, vsx_i32x4_const(OJPH_REPEAT4(0x01010101))); + v128_t d1 = vsx_i8x16_swizzle(ms_vec, byte_idx); + + // shift samples values to correct location + bit_idx = vsx_v128_or(bit_idx, vsx_i32x4_shl(bit_idx, 16)); + v128_t bit_shift = vsx_i8x16_swizzle( + vsx_i8x16_const(-1, 127, 63, 31, 15, 7, 3, 1, + -1, 127, 63, 31, 15, 7, 3, 1), bit_idx); + bit_shift = + vsx_i16x8_add(bit_shift, vsx_i16x8_const(OJPH_REPEAT8(0x0101))); + d0 = vsx_i16x8_mul(d0, bit_shift); + d0 = vsx_u16x8_shr(d0, 8); // we should have 8 bits in the LSB + d1 = vsx_i16x8_mul(d1, bit_shift); + d1 = // 8 in MSB + vsx_v128_and(d1, vsx_u32x4_const(OJPH_REPEAT4(0xFF00FF00))); + d0 = vsx_v128_or(d0, d1); + + // find location of e_k and mask + v128_t shift; + v128_t ones = vsx_i32x4_const(OJPH_REPEAT4(1)); + v128_t twos = vsx_i32x4_const(OJPH_REPEAT4(2)); + ui32 U_q_m1 = vsx_u32x4_extract_lane(U_q, 0) - 1u; + w0 = vsx_i32x4_sub(twos, w0); + shift = vsx_i32x4_shl(w0, U_q_m1); + ms_vec = vsx_v128_and(d0, vsx_i32x4_sub(shift, ones)); + + // next e_1 + w0 = vsx_v128_and(flags, vsx_i32x4_const(OJPH_REPEAT4(0x800))); + w0 = vsx_i32x4_eq(w0, vsx_i64x2_const(0, 0)); + w0 = vsx_v128_andnot(shift, w0); // e_1 in correct position + ms_vec = vsx_v128_or(ms_vec, w0); // e_1 + w0 = vsx_i32x4_shl(ms_vec, 31); // sign + ms_vec = vsx_v128_or(ms_vec, ones); // bin center + v128_t tvn = ms_vec; + ms_vec = vsx_i32x4_add(ms_vec, twos);// + 2 + ms_vec = vsx_i32x4_shl(ms_vec, p - 1); + ms_vec = vsx_v128_or(ms_vec, w0); // sign + row = vsx_v128_andnot(ms_vec, insig); // significant only + + ms_vec = vsx_v128_andnot(tvn, insig); // significant only + if (N == 0) // the compiler should remove one + tvn = vsx_i8x16_swizzle(ms_vec, + vsx_i32x4_const(0x07060504, 0x0F0E0D0C, -1, -1)); + else if (N == 1) + tvn = vsx_i8x16_swizzle(ms_vec, + vsx_i32x4_const(-1, 0x07060504, 0x0F0E0D0C, -1)); + else + assert(0); + vn = vsx_v128_or(vn, tvn); + + if (total_mn) + frwd_advance(magsgn, (ui32)total_mn); + } + return row; + } + + //************************************************************************/ + /** @brief decodes twos consecutive quads (one octet), using 16 bit data + * + * @param inf_u_q decoded VLC code, with interleaved u values + * @param U_q U values + * @param magsgn structure for forward data buffer + * @param p bitplane at which we are decoding + * @param vn used for handling E values (stores v_n values) + * @return v128_t decoded quad + */ + static inline + v128_t decode_two_quad16(const v128_t inf_u_q, v128_t U_q, + const ui8* dbuf, ui32 limit, ui32& pos, + ui32 p, v128_t& vn) + { + v128_t w0; // workers + v128_t insig; // lanes hold FF's if samples are insignificant + v128_t flags; // lanes hold e_k, e_1, and rho + v128_t row; // decoded row + + row = vsx_i64x2_const(0, 0); + w0 = vsx_i8x16_swizzle(inf_u_q, + vsx_i16x8_const(0x0100, 0x0100, 0x0100, 0x0100, + 0x0504, 0x0504, 0x0504, 0x0504)); + // we keeps e_k, e_1, and rho in w2 + flags = vsx_v128_and(w0, + vsx_u16x8_const(0x1110, 0x2220, 0x4440, 0x8880, + 0x1110, 0x2220, 0x4440, 0x8880)); + insig = vsx_i16x8_eq(flags, vsx_i64x2_const(0, 0)); + if (vsx_i8x16_bitmask(insig) != 0xFFFF) //are all insignificant? + { + U_q = vsx_i8x16_swizzle(U_q, + vsx_i16x8_const(0x0100, 0x0100, 0x0100, 0x0100, + 0x0504, 0x0504, 0x0504, 0x0504)); + flags = vsx_i16x8_mul(flags, vsx_i16x8_const(8,4,2,1,8,4,2,1)); + v128_t ms_vec = vsx_dfetch(dbuf, limit, pos); + + // U_q holds U_q for this quad + // flags has e_k, e_1, and rho such that e_k is sitting in the + // 0x8000, e_1 in 0x800, and rho in 0x80 + + // next e_k and m_n + v128_t m_n; + w0 = vsx_u16x8_shr(flags, 15); // e_k + m_n = vsx_i16x8_sub(U_q, w0); + m_n = vsx_v128_andnot(m_n, insig); + + // find cumulative sums + // to find at which bit in ms_vec the sample starts + v128_t ex_sum, shfl, inc_sum = m_n; // inclusive scan + shfl = vsx_i16x8_shuffle(vsx_i64x2_const(0,0), + inc_sum, 7, 8, 9, 10, 11, 12, 13, 14); + inc_sum = vsx_i16x8_add(inc_sum, shfl); + shfl = vsx_i32x4_shuffle(vsx_i64x2_const(0,0), inc_sum, 3, 4, 5, 6); + inc_sum = vsx_i16x8_add(inc_sum, shfl); + shfl = vsx_i64x2_shuffle(vsx_i64x2_const(0,0), inc_sum, 1, 2); + inc_sum = vsx_i16x8_add(inc_sum, shfl); + int total_mn = vsx_u16x8_extract_lane(inc_sum, 7); + ex_sum = vsx_i16x8_shuffle(vsx_i64x2_const(0,0), + inc_sum, 7, 8, 9, 10, 11, 12, 13, 14); + + // find the starting byte and starting bit + v128_t byte_idx = vsx_u16x8_shr(ex_sum, 3); + v128_t bit_idx = + vsx_v128_and(ex_sum, vsx_i16x8_const(OJPH_REPEAT8(7))); + byte_idx = vsx_i8x16_swizzle(byte_idx, + vsx_i16x8_const(0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0A0A, 0x0C0C, 0x0E0E)); + byte_idx = + vsx_i16x8_add(byte_idx, vsx_i16x8_const(OJPH_REPEAT8(0x0100))); + v128_t d0 = vsx_i8x16_swizzle(ms_vec, byte_idx); + byte_idx = + vsx_i16x8_add(byte_idx, vsx_i16x8_const(OJPH_REPEAT8(0x0101))); + v128_t d1 = vsx_i8x16_swizzle(ms_vec, byte_idx); + + // shift samples values to correct location + v128_t bit_shift = vsx_i8x16_swizzle( + vsx_i8x16_const(-1, 127, 63, 31, 15, 7, 3, 1, + -1, 127, 63, 31, 15, 7, 3, 1), bit_idx); + bit_shift = + vsx_i16x8_add(bit_shift, vsx_i16x8_const(OJPH_REPEAT8(0x0101))); + d0 = vsx_i16x8_mul(d0, bit_shift); + d0 = vsx_u16x8_shr(d0, 8); // we should have 8 bits in the LSB + d1 = vsx_i16x8_mul(d1, bit_shift); + d1 = // 8 in MSB + vsx_v128_and(d1, vsx_i16x8_const(OJPH_REPEAT8((si16)0xFF00))); + d0 = vsx_v128_or(d0, d1); + + // find location of e_k and mask + v128_t shift, t0, t1; + v128_t ones = vsx_i16x8_const(OJPH_REPEAT8(1)); + v128_t twos = vsx_i16x8_const(OJPH_REPEAT8(2)); + v128_t U_q_m1 = vsx_i32x4_sub(U_q, ones); + ui32 Uq0 = vsx_u16x8_extract_lane(U_q_m1, 0); + ui32 Uq1 = vsx_u16x8_extract_lane(U_q_m1, 4); + w0 = vsx_i16x8_sub(twos, w0); + t0 = vsx_v128_and(w0, vsx_i64x2_const(-1, 0)); + t1 = vsx_v128_and(w0, vsx_i64x2_const(0, -1)); + t0 = vsx_i32x4_shl(t0, Uq0); + t1 = vsx_i32x4_shl(t1, Uq1); + shift = vsx_v128_or(t0, t1); + ms_vec = vsx_v128_and(d0, vsx_i16x8_sub(shift, ones)); + + // next e_1 + w0 = vsx_v128_and(flags, vsx_i16x8_const(OJPH_REPEAT8(0x800))); + w0 = vsx_i16x8_eq(w0, vsx_i64x2_const(0, 0)); + w0 = vsx_v128_andnot(shift, w0); // e_1 in correct position + ms_vec = vsx_v128_or(ms_vec, w0); // e_1 + w0 = vsx_i16x8_shl(ms_vec, 15); // sign + ms_vec = vsx_v128_or(ms_vec, ones); // bin center + v128_t tvn = ms_vec; + ms_vec = vsx_i16x8_add(ms_vec, twos);// + 2 + ms_vec = vsx_i16x8_shl(ms_vec, p - 1); + ms_vec = vsx_v128_or(ms_vec, w0); // sign + row = vsx_v128_andnot(ms_vec, insig); // significant only + + ms_vec = vsx_v128_andnot(tvn, insig); // significant only + w0 = vsx_i8x16_swizzle(ms_vec, + vsx_i16x8_const(0x0302, 0x0706, -1, -1, -1, -1, -1, -1)); + vn = vsx_v128_or(vn, w0); + w0 = vsx_i8x16_swizzle(ms_vec, + vsx_i16x8_const(-1, 0x0B0A, 0x0F0E, -1, -1, -1, -1, -1)); + vn = vsx_v128_or(vn, w0); + + pos += (ui32)total_mn; + } + return row; + } + + + //************************************************************************/ + /** @brief Decodes one codeblock, processing the cleanup, siginificance + * propagation, and magnitude refinement pass + * + * @param [in] coded_data is a pointer to bitstream + * @param [in] decoded_data is a pointer to decoded codeblock data buf. + * @param [in] missing_msbs is the number of missing MSBs + * @param [in] num_passes is the number of passes: 1 if CUP only, + * 2 for CUP+SPP, and 3 for CUP+SPP+MRP + * @param [in] lengths1 is the length of cleanup pass + * @param [in] lengths2 is the length of refinement passes (either SPP + * only or SPP+MRP) + * @param [in] width is the decoded codeblock width + * @param [in] height is the decoded codeblock height + * @param [in] stride is the decoded codeblock buffer stride + * @param [in] stripe_causal is true for stripe causal mode + */ + bool ojph_decode_codeblock_vsx(ui8* coded_data, ui32* decoded_data, + ui32 missing_msbs, ui32 num_passes, + ui32 lengths1, ui32 lengths2, + ui32 width, ui32 height, ui32 stride, + bool stripe_causal) + { + static bool insufficient_precision = false; + static bool modify_code = false; + static bool truncate_spp_mrp = false; + + if (num_passes > 1 && lengths2 == 0) + { + OJPH_WARN(0x00010001, "A malformed codeblock that has more than " + "one coding pass, but zero length for " + "2nd and potential 3rd pass.\n"); + num_passes = 1; + } + + if (num_passes > 3) + { + OJPH_WARN(0x00010002, "We do not support more than 3 coding passes; " + "This codeblocks has %d passes.\n", + num_passes); + return false; + } + + if (missing_msbs > 30) // p < 0 + { + if (insufficient_precision == false) + { + insufficient_precision = true; + OJPH_WARN(0x00010003, "32 bits are not enough to decode this " + "codeblock. This message will not be " + "displayed again.\n"); + } + return false; + } + else if (missing_msbs == 30) // p == 0 + { // not enough precision to decode and set the bin center to 1 + if (modify_code == false) { + modify_code = true; + OJPH_WARN(0x00010004, "Not enough precision to decode the cleanup " + "pass. The code can be modified to support " + "this case. This message will not be " + "displayed again.\n"); + } + return false; // 32 bits are not enough to decode this + } + else if (missing_msbs == 29) // if p is 1, then num_passes must be 1 + { + if (num_passes > 1) { + num_passes = 1; + if (truncate_spp_mrp == false) { + truncate_spp_mrp = true; + OJPH_WARN(0x00010005, "Not enough precision to decode the SgnProp " + "nor MagRef passes; both will be skipped. " + "This message will not be displayed " + "again.\n"); + } + } + } + ui32 p = 30 - missing_msbs; // The least significant bitplane for CUP + // There is a way to handle the case of p == 0, but a different path + // is required + + if (lengths1 < 2) + { + OJPH_WARN(0x00010006, "Wrong codeblock length.\n"); + return false; + } + + // read scup and fix the bytes there + int lcup, scup; + lcup = (int)lengths1; // length of CUP + //scup is the length of MEL + VLC + scup = (((int)coded_data[lcup-1]) << 4) + (coded_data[lcup-2] & 0xF); + if (scup < 2 || scup > lcup || scup > 4079) //something is wrong + return false; + + // The temporary storage scratch holds two types of data in an + // interleaved fashion. The interleaving allows us to use one + // memory pointer. + // We have one entry for a decoded VLC code, and one entry for UVLC. + // Entries are 16 bits each, corresponding to one quad, + // but since we want to use XMM registers of the SSE family + // of SIMD; we allocated 16 bytes or more per quad row; that is, + // the width is no smaller than 16 bytes (or 8 entries), and the + // height is 512 quads + // Each VLC entry contains, in the following order, starting + // from MSB + // e_k (4bits), e_1 (4bits), rho (4bits), useless for step 2 (4bits) + // Each entry in UVLC contains u_q + // One extra row to handle the case of SPP propagating downwards + // when codeblock width is 4 + ui16 scratch[8 * 513] = {0}; // 8+ kB + + // We need an extra two entries (one inf and one u_q) beyond + // the last column. + // If the block width is 4 (2 quads), then we use sstr of 8 + // (enough for 4 quads). If width is 8 (4 quads) we use + // sstr is 16 (enough for 8 quads). For a width of 16 (8 + // quads), we use 24 (enough for 12 quads). + ui32 sstr = ((width + 2u) + 7u) & ~7u; // multiples of 8 + + assert((stride & 0x3) == 0); + + ui32 mmsbp2 = missing_msbs + 2; + + // The cleanup pass is decoded in two steps; in step one, + // the VLC and MEL segments are decoded, generating a record that + // has 2 bytes per quad. The 2 bytes contain, u, rho, e^1 & e^k. + // This information should be sufficient for the next step. + // In step 2, we decode the MagSgn segment. + + // step 1 decoding VLC and MEL segments + { + // init structures + dec_mel_st mel; + mel_init(&mel, coded_data, lcup, scup); + rev_struct vlc; + rev_init(&vlc, coded_data, lcup, scup); + + int run = mel_get_run(&mel); // decode runs of events from MEL bitstrm + // data represented as runs of 0 events + // See mel_decode description + + ui32 vlc_val; + ui32 c_q = 0; + ui16 *sp = scratch; + //initial quad row + for (ui32 x = 0; x < width; sp += 4) + { + // decode VLC + ///////////// + + // first quad + vlc_val = rev_fetch(&vlc); + + //decode VLC using the context c_q and the head of VLC bitstream + ui16 t0 = vlc_tbl0[ c_q + (vlc_val & 0x7F) ]; + + // if context is zero, use one MEL event + if (c_q == 0) //zero context + { + run -= 2; //subtract 2, since events number if multiplied by 2 + + // Is the run terminated in 1? if so, use decoded VLC code, + // otherwise, discard decoded data, since we will decoded again + // using a different context + t0 = (run == -1) ? t0 : 0; + + // is run -1 or -2? this means a run has been consumed + if (run < 0) + run = mel_get_run(&mel); // get another run + } + //run -= (c_q == 0) ? 2 : 0; + //t0 = (c_q != 0 || run == -1) ? t0 : 0; + //if (run < 0) + // run = mel_get_run(&mel); // get another run + sp[0] = t0; + x += 2; + + // prepare context for the next quad; eqn. 1 in ITU T.814 + c_q = ((t0 & 0x10U) << 3) | ((t0 & 0xE0U) << 2); + + //remove data from vlc stream (0 bits are removed if vlc is not used) + vlc_val = rev_advance(&vlc, t0 & 0x7); + + //second quad + ui16 t1 = 0; + + //decode VLC using the context c_q and the head of VLC bitstream + t1 = vlc_tbl0[c_q + (vlc_val & 0x7F)]; + + // if context is zero, use one MEL event + if (c_q == 0 && x < width) //zero context + { + run -= 2; //subtract 2, since events number if multiplied by 2 + + // if event is 0, discard decoded t1 + t1 = (run == -1) ? t1 : 0; + + if (run < 0) // have we consumed all events in a run + run = mel_get_run(&mel); // if yes, then get another run + } + t1 = x < width ? t1 : 0; + //run -= (c_q == 0 && x < width) ? 2 : 0; + //t1 = (c_q != 0 || run == -1) ? t1 : 0; + //if (run < 0) + // run = mel_get_run(&mel); // get another run + sp[2] = t1; + x += 2; + + //prepare context for the next quad, eqn. 1 in ITU T.814 + c_q = ((t1 & 0x10U) << 3) | ((t1 & 0xE0U) << 2); + + //remove data from vlc stream, if qinf is not used, cwdlen is 0 + vlc_val = rev_advance(&vlc, t1 & 0x7); + + // decode u + ///////////// + // uvlc_mode is made up of u_offset bits from the quad pair + ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4); + if (uvlc_mode == 0xc0)// if both u_offset are set, get an event from + { // the MEL run of events + run -= 2; //subtract 2, since events number if multiplied by 2 + + uvlc_mode += (run == -1) ? 0x40 : 0; // increment uvlc_mode by + // is 0x40 + + if (run < 0)//if run is consumed (run is -1 or -2), get another run + run = mel_get_run(&mel); + } + //run -= (uvlc_mode == 0xc0) ? 2 : 0; + //uvlc_mode += (uvlc_mode == 0xc0 && run == -1) ? 0x40 : 0; + //if (run < 0) + // run = mel_get_run(&mel); // get another run + + //decode uvlc_mode to get u for both quads + ui32 uvlc_entry = uvlc_tbl0[uvlc_mode + (vlc_val & 0x3F)]; + //remove total prefix length + vlc_val = rev_advance(&vlc, uvlc_entry & 0x7); + uvlc_entry >>= 3; + //extract suffixes for quad 0 and 1 + ui32 len = uvlc_entry & 0xF; //suffix length for 2 quads + ui32 tmp = vlc_val & ((1 << len) - 1); //suffix value for 2 quads + vlc_val = rev_advance(&vlc, len); + uvlc_entry >>= 4; + // quad 0 length + len = uvlc_entry & 0x7; // quad 0 suffix length + uvlc_entry >>= 3; + ui16 u_q = (ui16)(1 + (uvlc_entry&7) + (tmp&~(0xFFU<> 3) + (tmp >> len)); //kappa == 1 + sp[3] = u_q; + } + sp[0] = sp[1] = 0; + + //non initial quad rows + for (ui32 y = 2; y < height; y += 2) + { + c_q = 0; // context + ui16 *sp = scratch + (y >> 1) * sstr; // this row of quads + + for (ui32 x = 0; x < width; sp += 4) + { + // decode VLC + ///////////// + + // sigma_q (n, ne, nf) + c_q |= ((sp[0 - (si32)sstr] & 0xA0U) << 2); + c_q |= ((sp[2 - (si32)sstr] & 0x20U) << 4); + + // first quad + vlc_val = rev_fetch(&vlc); + + //decode VLC using the context c_q and the head of VLC bitstream + ui16 t0 = vlc_tbl1[ c_q + (vlc_val & 0x7F) ]; + + // if context is zero, use one MEL event + if (c_q == 0) //zero context + { + run -= 2; //subtract 2, since events number is multiplied by 2 + + // Is the run terminated in 1? if so, use decoded VLC code, + // otherwise, discard decoded data, since we will decoded again + // using a different context + t0 = (run == -1) ? t0 : 0; + + // is run -1 or -2? this means a run has been consumed + if (run < 0) + run = mel_get_run(&mel); // get another run + } + //run -= (c_q == 0) ? 2 : 0; + //t0 = (c_q != 0 || run == -1) ? t0 : 0; + //if (run < 0) + // run = mel_get_run(&mel); // get another run + sp[0] = t0; + x += 2; + + // prepare context for the next quad; eqn. 2 in ITU T.814 + // sigma_q (w, sw) + c_q = ((t0 & 0x40U) << 2) | ((t0 & 0x80U) << 1); + // sigma_q (nw) + c_q |= sp[0 - (si32)sstr] & 0x80; + // sigma_q (n, ne, nf) + c_q |= ((sp[2 - (si32)sstr] & 0xA0U) << 2); + c_q |= ((sp[4 - (si32)sstr] & 0x20U) << 4); + + //remove data from vlc stream (0 bits are removed if vlc is unused) + vlc_val = rev_advance(&vlc, t0 & 0x7); + + //second quad + ui16 t1 = 0; + + //decode VLC using the context c_q and the head of VLC bitstream + t1 = vlc_tbl1[ c_q + (vlc_val & 0x7F)]; + + // if context is zero, use one MEL event + if (c_q == 0 && x < width) //zero context + { + run -= 2; //subtract 2, since events number if multiplied by 2 + + // if event is 0, discard decoded t1 + t1 = (run == -1) ? t1 : 0; + + if (run < 0) // have we consumed all events in a run + run = mel_get_run(&mel); // if yes, then get another run + } + t1 = x < width ? t1 : 0; + //run -= (c_q == 0 && x < width) ? 2 : 0; + //t1 = (c_q != 0 || run == -1) ? t1 : 0; + //if (run < 0) + // run = mel_get_run(&mel); // get another run + sp[2] = t1; + x += 2; + + // partial c_q, will be completed when we process the next quad + // sigma_q (w, sw) + c_q = ((t1 & 0x40U) << 2) | ((t1 & 0x80U) << 1); + // sigma_q (nw) + c_q |= sp[2 - (si32)sstr] & 0x80; + + //remove data from vlc stream, if qinf is not used, cwdlen is 0 + vlc_val = rev_advance(&vlc, t1 & 0x7); + + // decode u + ///////////// + // uvlc_mode is made up of u_offset bits from the quad pair + ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4); + ui32 uvlc_entry = uvlc_tbl1[uvlc_mode + (vlc_val & 0x3F)]; + //remove total prefix length + vlc_val = rev_advance(&vlc, uvlc_entry & 0x7); + uvlc_entry >>= 3; + //extract suffixes for quad 0 and 1 + ui32 len = uvlc_entry & 0xF; //suffix length for 2 quads + ui32 tmp = vlc_val & ((1 << len) - 1); //suffix value for 2 quads + vlc_val = rev_advance(&vlc, len); + uvlc_entry >>= 4; + // quad 0 length + len = uvlc_entry & 0x7; // quad 0 suffix length + uvlc_entry >>= 3; + ui16 u_q = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFU << len))); //u_q + sp[1] = u_q; + u_q = (ui16)((uvlc_entry >> 3) + (tmp >> len)); // u_q + sp[3] = u_q; + } + sp[0] = sp[1] = 0; + } + } + + // step2 we decode magsgn + // mmsbp2 equals K_max + 1 (we decode up to K_max bits + 1 sign bit) + // The 32 bit path decode 16 bits data, for which one would think + // 16 bits are enough, because we want to put in the center of the + // bin. + // If you have mmsbp2 equals 16 bit, and reversible coding, and + // no bitplanes are missing, then we can decoding using the 16 bit + // path, but we are not doing this here. + if (mmsbp2 >= 16) + { + // We allocate a scratch row for storing v_n values. + // We have 512 quads horizontally. + // We may go beyond the last entry by up to 4 entries. + // Here we allocate additional 8 entries. + // There are two rows in this structure, the bottom + // row is used to store processed entries. + const int v_n_size = 512 + 8; + ui32 v_n_scratch[2 * v_n_size] = {0}; // 4+ kB + + frwd_struct magsgn; + frwd_init<0xFF>(&magsgn, coded_data, lcup - scup); + + { + ui16 *sp = scratch; + ui32 *vp = v_n_scratch; + ui32 *dp = decoded_data; + vp[0] = 2; // for easy calculation of emax + + for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4) + { + //here we process two quads + v128_t w0, w1; // workers + v128_t inf_u_q, U_q; + // determine U_q + { + inf_u_q = vsx_v128_load(sp); + U_q = vsx_u32x4_shr(inf_u_q, 16); + + w0 = vsx_i32x4_gt(U_q, vsx_u32x4_splat(mmsbp2)); + ui32 i = vsx_i8x16_bitmask(w0); + if (i & 0xFF) // only the lower two U_q + return false; + } + + v128_t vn = vsx_i32x4_const(OJPH_REPEAT4(2)); + v128_t row0 = decode_one_quad32<0>(inf_u_q, U_q, &magsgn, p, vn); + v128_t row1 = decode_one_quad32<1>(inf_u_q, U_q, &magsgn, p, vn); + w0 = vsx_v128_load(vp); + w0 = vsx_v128_and(w0, vsx_i32x4_const(-1,0,0,0)); + w0 = vsx_v128_or(w0, vn); + vsx_v128_store(vp, w0); + + //interleave in ssse3 style + + w0 = vsx_i32x4_shuffle(row0, row1, 0, 4, 1, 5); + w1 = vsx_i32x4_shuffle(row0, row1, 2, 6, 3, 7); + row0 = vsx_i32x4_shuffle(w0, w1, 0, 4, 1, 5); + row1 = vsx_i32x4_shuffle(w0, w1, 2, 6, 3, 7); + vsx_v128_store(dp, row0); + vsx_v128_store(dp + stride, row1); + } + } + + for (ui32 y = 2; y < height; y += 2) + { + { + // perform 31 - count_leading_zeros(*vp) here + ui32 *vp = v_n_scratch; + const v128_t lut_lo = vsx_i8x16_const( + 31, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4 + ); + const v128_t lut_hi = vsx_i8x16_const( + 31, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 + ); + const v128_t nibble_mask = vsx_i8x16_const(OJPH_REPEAT16(0x0F)); + const v128_t byte_offset8 = vsx_i16x8_const(OJPH_REPEAT8(8)); + const v128_t byte_offset16 = vsx_i16x8_const(OJPH_REPEAT8(16)); + const v128_t cc = vsx_i32x4_const(OJPH_REPEAT4(31)); + for (ui32 x = 0; x <= width; x += 8, vp += 4) + { + v128_t v, t; // workers + v = vsx_v128_load(vp); + + t = vsx_v128_and(nibble_mask, v); + v = vsx_v128_and(vsx_u16x8_shr(v, 4), nibble_mask); + t = vsx_i8x16_swizzle(lut_lo, t); + v = vsx_i8x16_swizzle(lut_hi, v); + v = vsx_u8x16_min(v, t); + + t = vsx_u16x8_shr(v, 8); + v = vsx_v128_or(v, byte_offset8); + v = vsx_u8x16_min(v, t); + + t = vsx_u32x4_shr(v, 16); + v = vsx_v128_or(v, byte_offset16); + v = vsx_u8x16_min(v, t); + + v = vsx_i16x8_sub(cc, v); + vsx_v128_store(vp + v_n_size, v); + } + } + + ui32 *vp = v_n_scratch; + ui16 *sp = scratch + (y >> 1) * sstr; + ui32 *dp = decoded_data + y * stride; + vp[0] = 2; // for easy calculation of emax + + for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4) + { + //process two quads + v128_t w0, w1; // workers + v128_t inf_u_q, U_q; + // determine U_q + { + v128_t gamma, emax, kappa, u_q; // needed locally + + inf_u_q = vsx_v128_load(sp); + gamma = + vsx_v128_and(inf_u_q, vsx_i32x4_const(OJPH_REPEAT4(0xF0))); + w0 = vsx_i32x4_sub(gamma, vsx_i32x4_const(OJPH_REPEAT4(1))); + gamma = vsx_v128_and(gamma, w0); + gamma = vsx_i32x4_eq(gamma, vsx_i64x2_const(0, 0)); + + emax = vsx_v128_load(vp + v_n_size); + w0 = vsx_i32x4_shuffle(emax, vsx_i64x2_const(0,0), 1, 2, 3, 4); + emax = vsx_i16x8_max(w0, emax); // no max_epi32 in ssse3 + emax = vsx_v128_andnot(emax, gamma); + + kappa = vsx_i32x4_const(OJPH_REPEAT4(1)); + kappa = vsx_i16x8_max(emax, kappa); // no max_epi32 in ssse3 + + u_q = vsx_u32x4_shr(inf_u_q, 16); + U_q = vsx_i32x4_add(u_q, kappa); + + w0 = vsx_i32x4_gt(U_q, vsx_u32x4_splat(mmsbp2)); + ui32 i = vsx_i8x16_bitmask(w0); + if (i & 0xFF) // only the lower two U_q + return false; + } + + v128_t vn = vsx_i32x4_const(OJPH_REPEAT4(2)); + v128_t row0 = decode_one_quad32<0>(inf_u_q, U_q, &magsgn, p, vn); + v128_t row1 = decode_one_quad32<1>(inf_u_q, U_q, &magsgn, p, vn); + w0 = vsx_v128_load(vp); + w0 = vsx_v128_and(w0, vsx_i32x4_const(-1,0,0,0)); + w0 = vsx_v128_or(w0, vn); + vsx_v128_store(vp, w0); + + //interleave in ssse3 style + w0 = vsx_i32x4_shuffle(row0, row1, 0, 4, 1, 5); + w1 = vsx_i32x4_shuffle(row0, row1, 2, 6, 3, 7); + row0 = vsx_i32x4_shuffle(w0, w1, 0, 4, 1, 5); + row1 = vsx_i32x4_shuffle(w0, w1, 2, 6, 3, 7); + vsx_v128_store(dp, row0); + vsx_v128_store(dp + stride, row1); + } + } + } + else + { + // reduce bitplane by 16 because we now have 16 bits instead of 32 + p -= 16; + + // We allocate a scratch row for storing v_n values. + // We have 512 quads horizontally. + // We may go beyond the last entry by up to 8 entries. + // Therefore we allocate additional 8 entries. + // There are two rows in this structure, the bottom + // row is used to store processed entries. + const int v_n_size = 512 + 8; + ui16 v_n_scratch[2 * v_n_size] = {0}; // 2+ kB + + // destuff the MagSgn bitstream upfront; per-quad consumption then + // advances a bit position in a GPR (see destuff_frwd) + const ui32 dbuf_cap = 4096 * 15 / 8; + ui8 dbuf[dbuf_cap + 72]; + ui32 limit = destuff_frwd<0xFF>(coded_data, lcup - scup, + dbuf, dbuf_cap); + ui32 pos = 0; + + { + ui16 *sp = scratch; + ui16 *vp = v_n_scratch; + ui32 *dp = decoded_data; + vp[0] = 2; // for easy calculation of emax + + for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4) + { + //here we process two quads + v128_t w0, w1; // workers + v128_t inf_u_q, U_q; + // determine U_q + { + inf_u_q = vsx_v128_load(sp); + U_q = vsx_u32x4_shr(inf_u_q, 16); + + w0 = vsx_i32x4_gt(U_q, vsx_u32x4_splat(mmsbp2)); + ui32 i = vsx_i8x16_bitmask(w0); + if (i & 0xFF) // only the lower two U_q + return false; + } + + v128_t vn = vsx_i16x8_const(OJPH_REPEAT8(2)); + v128_t row = decode_two_quad16(inf_u_q, U_q, dbuf, limit, pos, p, vn); + w0 = vsx_v128_load(vp); + w0 = vsx_v128_and(w0, vsx_i16x8_const(-1,0,0,0,0,0,0,0)); + w0 = vsx_v128_or(w0, vn); + vsx_v128_store(vp, w0); + + //interleave in ssse3 style + w0 = vsx_i8x16_swizzle(row, + vsx_i16x8_const(-1, 0x0100, -1, 0x0504, + -1, 0x0908, -1, 0x0D0C)); + vsx_v128_store(dp, w0); + w1 = vsx_i8x16_swizzle(row, + vsx_i16x8_const(-1, 0x0302, -1, 0x0706, + -1, 0x0B0A, -1, 0x0F0E)); + vsx_v128_store(dp + stride, w1); + } + } + + for (ui32 y = 2; y < height; y += 2) + { + { + // perform 15 - count_leading_zeros(*vp) here + ui16 *vp = v_n_scratch; + const v128_t lut_lo = vsx_i8x16_const( + 15, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4 + ); + const v128_t lut_hi = vsx_i8x16_const( + 15, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 + ); + const v128_t nibble_mask = vsx_i8x16_const(OJPH_REPEAT16(0x0F)); + const v128_t byte_offset8 = vsx_i16x8_const(OJPH_REPEAT8(8)); + const v128_t cc = vsx_i16x8_const(OJPH_REPEAT8(15)); + for (ui32 x = 0; x <= width; x += 16, vp += 8) + { + v128_t v, t; // workers + v = vsx_v128_load(vp); + + t = vsx_v128_and(nibble_mask, v); + v = vsx_v128_and(vsx_u16x8_shr(v, 4), nibble_mask); + t = vsx_i8x16_swizzle(lut_lo, t); + v = vsx_i8x16_swizzle(lut_hi, v); + v = vsx_u8x16_min(v, t); + + t = vsx_u16x8_shr(v, 8); + v = vsx_v128_or(v, byte_offset8); + v = vsx_u8x16_min(v, t); + + v = vsx_i16x8_sub(cc, v); + vsx_v128_store(vp + v_n_size, v); + } + } + + ui16 *vp = v_n_scratch; + ui16 *sp = scratch + (y >> 1) * sstr; + ui32 *dp = decoded_data + y * stride; + vp[0] = 2; // for easy calculation of emax + + for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4) + { + //process two quads + v128_t w0, w1; // workers + v128_t inf_u_q, U_q; + // determine U_q + { + v128_t gamma, emax, kappa, u_q; // needed locally + + inf_u_q = vsx_v128_load(sp); + gamma = + vsx_v128_and(inf_u_q, vsx_i32x4_const(OJPH_REPEAT4(0xF0))); + w0 = vsx_i32x4_sub(gamma, vsx_i32x4_const(OJPH_REPEAT4(1))); + gamma = vsx_v128_and(gamma, w0); + gamma = vsx_i32x4_eq(gamma, vsx_i64x2_const(0, 0)); + + emax = vsx_v128_load(vp + v_n_size); + w0 = vsx_i16x8_shuffle(emax, + vsx_i64x2_const(0, 0), 1, 2, 3, 4, 5, 6, 7, 8); + emax = vsx_i16x8_max(w0, emax); // no max_epi32 in ssse3 + emax = vsx_i8x16_swizzle(emax, + vsx_i16x8_const(0x0100, -1, 0x0302, -1, + 0x0504, -1, 0x0706, -1)); + emax = vsx_v128_andnot(emax, gamma); + + kappa = vsx_i32x4_const(OJPH_REPEAT4(1)); + kappa = vsx_i16x8_max(emax, kappa); // no max_epi32 in ssse3 + + u_q = vsx_u32x4_shr(inf_u_q, 16); + U_q = vsx_i32x4_add(u_q, kappa); + + w0 = vsx_i32x4_gt(U_q, vsx_u32x4_splat(mmsbp2)); + ui32 i = vsx_i8x16_bitmask(w0); + if (i & 0xFF) // only the lower two U_q + return false; + } + + v128_t vn = vsx_i16x8_const(OJPH_REPEAT8(2)); + v128_t row = decode_two_quad16(inf_u_q, U_q, dbuf, limit, pos, p, vn); + w0 = vsx_v128_load(vp); + w0 = vsx_v128_and(w0, vsx_i16x8_const(-1,0,0,0,0,0,0,0)); + w0 = vsx_v128_or(w0, vn); + vsx_v128_store(vp, w0); + + w0 = vsx_i8x16_swizzle(row, + vsx_i16x8_const(-1, 0x0100, -1, 0x0504, + -1, 0x0908, -1, 0x0D0C)); + vsx_v128_store(dp, w0); + w1 = vsx_i8x16_swizzle(row, + vsx_i16x8_const(-1, 0x0302, -1, 0x0706, + -1, 0x0B0A, -1, 0x0F0E)); + vsx_v128_store(dp + stride, w1); + } + } + + // increase bitplane back by 16 because we need to process 32 bits + p += 16; + } + + if (num_passes > 1) + { + // We use scratch again, we can divide it into multiple regions + // sigma holds all the significant samples, and it cannot + // be modified after it is set. it will be used during the + // Magnitude Refinement Pass + ui16* const sigma = scratch; + + ui32 mstr = (width + 3u) >> 2; // divide by 4, since each + // ui16 contains 4 columns + mstr = ((mstr + 2u) + 7u) & ~7u; // multiples of 8 + + // We re-arrange quad significance, where each 4 consecutive + // bits represent one quad, into column significance, where, + // each 4 consequtive bits represent one column of 4 rows + { + ui32 y; + + const v128_t mask_3 = vsx_i32x4_const(OJPH_REPEAT4(0x30)); + const v128_t mask_C = vsx_i32x4_const(OJPH_REPEAT4(0xC0)); + const v128_t shuffle_mask = vsx_i32x4_const(0x0C080400,-1,-1,-1); + for (y = 0; y < height; y += 4) + { + ui16* sp = scratch + (y >> 1) * sstr; + ui16* dp = sigma + (y >> 2) * mstr; + for (ui32 x = 0; x < width; x += 8, sp += 8, dp += 2) + { + v128_t s0, s1, u3, uC, t0, t1; + + s0 = vsx_v128_load(sp); + u3 = vsx_v128_and(s0, mask_3); + u3 = vsx_u32x4_shr(u3, 4); + uC = vsx_v128_and(s0, mask_C); + uC = vsx_u32x4_shr(uC, 2); + t0 = vsx_v128_or(u3, uC); + + s1 = vsx_v128_load(sp + sstr); + u3 = vsx_v128_and(s1, mask_3); + u3 = vsx_u32x4_shr(u3, 2); + uC = vsx_v128_and(s1, mask_C); + t1 = vsx_v128_or(u3, uC); + + v128_t r = vsx_v128_or(t0, t1); + r = vsx_i8x16_swizzle(r, shuffle_mask); + + vsx_v128_store32_lane(dp, r, 0); + } + dp[0] = 0; // set an extra entry on the right with 0 + } + { + // reset one row after the codeblock + ui16* dp = sigma + (y >> 2) * mstr; + v128_t zero = vsx_i64x2_const(0, 0); + for (ui32 x = 0; x < width; x += 32, dp += 8) + vsx_v128_store(dp, zero); + dp[0] = 0; // set an extra entry on the right with 0 + } + } + + // We perform Significance Propagation Pass here + { + // This stores significance information of the previous + // 4 rows. Significance information in this array includes + // all signicant samples in bitplane p - 1; that is, + // significant samples for bitplane p (discovered during the + // cleanup pass and stored in sigma) and samples that have recently + // became significant (during the SPP) in bitplane p-1. + // We store enough for the widest row, containing 1024 columns, + // which is equivalent to 256 of ui16, since each stores 4 columns. + // We add an extra 8 entries, just in case we need more + ui16 prev_row_sig[256 + 8] = {0}; // 528 Bytes + + frwd_struct sigprop; + frwd_init<0>(&sigprop, coded_data + lengths1, (int)lengths2); + + for (ui32 y = 0; y < height; y += 4) + { + ui32 pattern = 0xFFFFu; // a pattern needed samples + if (height - y < 4) { + pattern = 0x7777u; + if (height - y < 3) { + pattern = 0x3333u; + if (height - y < 2) + pattern = 0x1111u; + } + } + + // prev holds sign. info. for the previous quad, together + // with the rows on top of it and below it. + ui32 prev = 0; + ui16 *prev_sig = prev_row_sig; + ui16 *cur_sig = sigma + (y >> 2) * mstr; + ui32 *dpp = decoded_data + y * stride; + for (ui32 x = 0; x < width; x += 4, dpp += 4, ++cur_sig, ++prev_sig) + { + // only rows and columns inside the stripe are included + si32 s = (si32)x + 4 - (si32)width; + s = ojph_max(s, 0); + pattern = pattern >> (s * 4); + + // We first find locations that need to be tested (potential + // SPP members); these location will end up in mbr + // In each iteration, we produce 16 bits because cwd can have + // up to 16 bits of significance information, followed by the + // corresponding 16 bits of sign information; therefore, it is + // sufficient to fetch 32 bit data per loop. + + // Althougth we are interested in 16 bits only, we load 32 bits. + // For the 16 bits we are producing, we need the next 4 bits -- + // We need data for at least 5 columns out of 8. + // Therefore loading 32 bits is easier than loading 16 bits + // twice. + ui32 ps; memcpy(&ps, prev_sig, sizeof(ps)); + ui32 ns; memcpy(&ns, cur_sig + mstr, sizeof(ns)); + ui32 u = (ps & 0x88888888) >> 3; // the row on top + if (!stripe_causal) + u |= (ns & 0x11111111) << 3; // the row below + + ui32 cs; memcpy(&cs, cur_sig, sizeof(cs)); + // vertical integration + ui32 mbr = cs; // this sig. info. + mbr |= (cs & 0x77777777) << 1; //above neighbors + mbr |= (cs & 0xEEEEEEEE) >> 1; //below neighbors + mbr |= u; + // horizontal integration + ui32 t = mbr; + mbr |= t << 4; // neighbors on the left + mbr |= t >> 4; // neighbors on the right + mbr |= prev >> 12; // significance of previous group + + // remove outside samples, and already significant samples + mbr &= pattern; + mbr &= ~cs; + + // find samples that become significant during the SPP + ui32 new_sig = mbr; + if (new_sig) + { + v128_t cwd_vec = frwd_fetch<0>(&sigprop); + ui32 cwd = vsx_u32x4_extract_lane(cwd_vec, 0); + + ui32 cnt = 0; + ui32 col_mask = 0xFu; + ui32 inv_sig = ~cs & pattern; + for (int i = 0; i < 16; i += 4, col_mask <<= 4) + { + if ((col_mask & new_sig) == 0) + continue; + + //scan one column + ui32 sample_mask = 0x1111u & col_mask; + if (new_sig & sample_mask) + { + new_sig &= ~sample_mask; + if (cwd & 1) + { + ui32 t = 0x33u << i; + new_sig |= t & inv_sig; + } + cwd >>= 1; ++cnt; + } + + sample_mask <<= 1; + if (new_sig & sample_mask) + { + new_sig &= ~sample_mask; + if (cwd & 1) + { + ui32 t = 0x76u << i; + new_sig |= t & inv_sig; + } + cwd >>= 1; ++cnt; + } + + sample_mask <<= 1; + if (new_sig & sample_mask) + { + new_sig &= ~sample_mask; + if (cwd & 1) + { + ui32 t = 0xECu << i; + new_sig |= t & inv_sig; + } + cwd >>= 1; ++cnt; + } + + sample_mask <<= 1; + if (new_sig & sample_mask) + { + new_sig &= ~sample_mask; + if (cwd & 1) + { + ui32 t = 0xC8u << i; + new_sig |= t & inv_sig; + } + cwd >>= 1; ++cnt; + } + } + + if (new_sig) + { + // Spread new_sig, such that each bit is in one byte with a + // value of 0 if new_sig bit is 0, and 0xFF if new_sig is 1 + v128_t new_sig_vec = vsx_i16x8_splat((si16)new_sig); + new_sig_vec = vsx_i8x16_swizzle(new_sig_vec, + vsx_i8x16_const(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1)); + new_sig_vec = vsx_v128_and(new_sig_vec, + vsx_u64x2_const(OJPH_REPEAT2(0x8040201008040201))); + new_sig_vec = vsx_i8x16_eq(new_sig_vec, + vsx_u64x2_const(OJPH_REPEAT2(0x8040201008040201))); + + // find cumulative sums + // to find which bit in cwd we should extract + v128_t ex_sum, shfl, inc_sum = new_sig_vec; // inclusive scan + inc_sum = vsx_i8x16_abs(inc_sum); // cvrt to 0 or 1 + shfl = vsx_i8x16_shuffle(vsx_i64x2_const(0,0), inc_sum, + 15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30); + inc_sum = vsx_i8x16_add(inc_sum, shfl); + shfl = vsx_i16x8_shuffle(vsx_i64x2_const(0,0), inc_sum, + 7, 8, 9, 10, 11, 12, 13, 14); + inc_sum = vsx_i8x16_add(inc_sum, shfl); + shfl = vsx_i32x4_shuffle(vsx_i64x2_const(0,0), inc_sum, + 3, 4, 5, 6); + inc_sum = vsx_i8x16_add(inc_sum, shfl); + shfl = vsx_i64x2_shuffle(vsx_i64x2_const(0,0), inc_sum, + 1, 2); + inc_sum = vsx_i8x16_add(inc_sum, shfl); + cnt += vsx_u8x16_extract_lane(inc_sum, 15); + // exclusive scan + ex_sum = vsx_i8x16_shuffle(vsx_i64x2_const(0,0), inc_sum, + 15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30); + + // Spread cwd, such that each bit is in one byte + // with a value of 0 or 1. + cwd_vec = vsx_i16x8_splat((si16)cwd); + cwd_vec = vsx_i8x16_swizzle(cwd_vec, + vsx_i8x16_const(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1)); + cwd_vec = vsx_v128_and(cwd_vec, + vsx_u64x2_const(OJPH_REPEAT2(0x8040201008040201))); + cwd_vec = vsx_i8x16_eq(cwd_vec, + vsx_u64x2_const(OJPH_REPEAT2(0x8040201008040201))); + cwd_vec = vsx_i8x16_abs(cwd_vec); + + // Obtain bit from cwd_vec correspondig to ex_sum + // Basically, collect needed bits from cwd_vec + v128_t v = vsx_i8x16_swizzle(cwd_vec, ex_sum); + + // load data and set spp coefficients + v128_t m = vsx_i8x16_const( + 0,-1,-1,-1,4,-1,-1,-1,8,-1,-1,-1,12,-1,-1,-1); + v128_t val = vsx_i32x4_splat(3 << (p - 2)); + ui32 *dp = dpp; + for (int c = 0; c < 4; ++ c) { + v128_t s0, s0_ns, s0_val; + // load coefficients + s0 = vsx_v128_load(dp); + + // epi32 is -1 only for coefficient that + // are changed during the SPP + s0_ns = vsx_i8x16_swizzle(new_sig_vec, m); + s0_ns = vsx_i32x4_eq(s0_ns, + vsx_i32x4_const(OJPH_REPEAT4(0xFF))); + + // obtain sign for coefficients in SPP + s0_val = vsx_i8x16_swizzle(v, m); + s0_val = vsx_i32x4_shl(s0_val, 31); + s0_val = vsx_v128_or(s0_val, val); + s0_val = vsx_v128_and(s0_val, s0_ns); + + // update vector + s0 = vsx_v128_or(s0, s0_val); + // store coefficients + vsx_v128_store(dp, s0); + // prepare for next row + dp += stride; + m = vsx_i32x4_add(m, vsx_i32x4_const(OJPH_REPEAT4(1))); + } + } + frwd_advance(&sigprop, cnt); + } + + new_sig |= cs; + *prev_sig = (ui16)(new_sig); + + // vertical integration for the new sig. info. + t = new_sig; + new_sig |= (t & 0x7777) << 1; //above neighbors + new_sig |= (t & 0xEEEE) >> 1; //below neighbors + // add sig. info. from the row on top and below + prev = new_sig | u; + // we need only the bits in 0xF000 + prev &= 0xF000; + } + } + } + + // We perform Magnitude Refinement Pass here + if (num_passes > 2) + { + rev_struct magref; + rev_init_mrp(&magref, coded_data, (int)lengths1, (int)lengths2); + + for (ui32 y = 0; y < height; y += 4) + { + ui16 *cur_sig = sigma + (y >> 2) * mstr; + ui32 *dpp = decoded_data + y * stride; + for (ui32 i = 0; i < width; i += 4, dpp += 4) + { + //Process one entry from sigma array at a time + // Each nibble (4 bits) in the sigma array represents 4 rows, + ui32 cwd = rev_fetch_mrp(&magref); // get 32 bit data + ui16 sig = *cur_sig++; // 16 bit that will be processed now + int total_bits = 0; + if (sig) // if any of the 32 bits are set + { + // We work on 4 rows, with 4 samples each, since + // data is 32 bit (4 bytes) + + // spread the 16 bits in sig to 0 or 1 bytes in sig_vec + v128_t sig_vec = vsx_i16x8_splat((si16)sig); + sig_vec = vsx_i8x16_swizzle(sig_vec, + vsx_i8x16_const(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1)); + sig_vec = vsx_v128_and(sig_vec, + vsx_u64x2_const(OJPH_REPEAT2(0x8040201008040201))); + sig_vec = vsx_i8x16_eq(sig_vec, + vsx_u64x2_const(OJPH_REPEAT2(0x8040201008040201))); + sig_vec = vsx_i8x16_abs(sig_vec); + + // find cumulative sums + // to find which bit in cwd we should extract + v128_t ex_sum, shfl, inc_sum = sig_vec; // inclusive scan + shfl = vsx_i8x16_shuffle(vsx_i64x2_const(0,0), inc_sum, + 15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30); + inc_sum = vsx_i8x16_add(inc_sum, shfl); + shfl = vsx_i16x8_shuffle(vsx_i64x2_const(0,0), inc_sum, + 7, 8, 9, 10, 11, 12, 13, 14); + inc_sum = vsx_i8x16_add(inc_sum, shfl); + shfl = vsx_i32x4_shuffle(vsx_i64x2_const(0,0), inc_sum, + 3, 4, 5, 6); + inc_sum = vsx_i8x16_add(inc_sum, shfl); + shfl = vsx_i64x2_shuffle(vsx_i64x2_const(0,0), inc_sum, + 1, 2); + inc_sum = vsx_i8x16_add(inc_sum, shfl); + total_bits = vsx_u8x16_extract_lane(inc_sum, 15); + // exclusive scan + ex_sum = vsx_i8x16_shuffle(vsx_i64x2_const(0,0), inc_sum, + 15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30); + + // Spread the 16 bits in cwd to inverted 0 or 1 bytes in + // cwd_vec. Then, convert these to a form suitable + // for coefficient modifications; in particular, a value + // of 0 is presented as binary 11, and a value of 1 is + // represented as binary 01 + v128_t cwd_vec = vsx_i16x8_splat((si16)cwd); + cwd_vec = vsx_i8x16_swizzle(cwd_vec, + vsx_i8x16_const(0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1)); + cwd_vec = vsx_v128_and(cwd_vec, + vsx_u64x2_const(OJPH_REPEAT2(0x8040201008040201))); + cwd_vec = vsx_i8x16_eq(cwd_vec, + vsx_u64x2_const(OJPH_REPEAT2(0x8040201008040201))); + cwd_vec = + vsx_i8x16_add(cwd_vec, vsx_i8x16_const(OJPH_REPEAT16(1))); + cwd_vec = vsx_i8x16_add(cwd_vec, cwd_vec); + cwd_vec = + vsx_v128_or(cwd_vec, vsx_i8x16_const(OJPH_REPEAT16(1))); + + // load data and insert the mrp bit + v128_t m = vsx_i8x16_const(0,-1,-1,-1,4,-1,-1,-1, + 8,-1,-1,-1,12,-1,-1,-1); + ui32 *dp = dpp; + for (int c = 0; c < 4; ++c) { + v128_t s0, s0_sig, s0_idx, s0_val; + // load coefficients + s0 = vsx_v128_load(dp); + // find significant samples in this row + s0_sig = vsx_i8x16_swizzle(sig_vec, m); + s0_sig = vsx_i8x16_eq(s0_sig, vsx_i64x2_const(0, 0)); + // get MRP bit index, and MRP pattern + s0_idx = vsx_i8x16_swizzle(ex_sum, m); + s0_val = vsx_i8x16_swizzle(cwd_vec, s0_idx); + // keep data from significant samples only + s0_val = vsx_v128_andnot(s0_val, s0_sig); + // move mrp bits to correct position, and employ + s0_val = vsx_i32x4_shl(s0_val, p - 2); + s0 = vsx_v128_xor(s0, s0_val); + // store coefficients + vsx_v128_store(dp, s0); + // prepare for next row + dp += stride; + m = vsx_i32x4_add(m, vsx_i32x4_const(OJPH_REPEAT4(1))); + } + } + // consume data according to the number of bits set + rev_advance_mrp(&magref, (ui32)total_bits); + } + } + } + } + + return true; + } + } +} diff --git a/src/core/openjph/ojph_arch.h b/src/core/openjph/ojph_arch.h index 84c2834d..f8ba7992 100644 --- a/src/core/openjph/ojph_arch.h +++ b/src/core/openjph/ojph_arch.h @@ -40,6 +40,7 @@ #ifndef OJPH_ARCH_H #define OJPH_ARCH_H +#include #include #include #include @@ -100,12 +101,20 @@ #define OJPH_ARCH_UNKNOWN #endif +// Only little-endian POWER (ppc64le) is supported for SIMD +#if defined(OJPH_ARCH_PPC64) && \ + (defined(__LITTLE_ENDIAN__) || \ + (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) + #define OJPH_ARCH_PPC64LE +#endif + namespace ojph { //////////////////////////////////////////////////////////////////////////// // disable SIMD for unknown architecture //////////////////////////////////////////////////////////////////////////// #if !defined(OJPH_ARCH_X86_64) && !defined(OJPH_ARCH_I386) && \ - !defined(OJPH_ARCH_ARM) && !defined(OJPH_DISABLE_SIMD) + !defined(OJPH_ARCH_ARM) && !defined(OJPH_ARCH_PPC64LE) && \ + !defined(OJPH_DISABLE_SIMD) #define OJPH_DISABLE_SIMD #endif // !OJPH_ARCH_UNKNOWN @@ -164,6 +173,14 @@ namespace ojph { ARM_CPU_EXT_LEVEL_SVE2 = 3, }; + // POWER9 (ISA 3.0) is the minimum supported SIMD level; older CPUs + // (POWER8 and earlier) use the generic code paths + enum : int { + PPC_CPU_EXT_LEVEL_GENERIC = 0, + PPC_CPU_EXT_LEVEL_ARCH_3_00 = 1, // ISA 3.0 (POWER9) + PPC_CPU_EXT_LEVEL_ARCH_3_1 = 2, // ISA 3.1 (POWER10) + }; + ///////////////////////////////////////////////////////////////////////////// static inline ui32 population_count(ui32 val) { @@ -451,8 +468,11 @@ namespace ojph { // result, irrespective of the machine's endianness static inline ui32 load_le_ui32(const ui8 *p) { - if (is_machine_little_endian) - return *(const ui32 *)p; + if (is_machine_little_endian) { + ui32 val; + std::memcpy(&val, p, sizeof(val)); + return val; + } else return (ui32)p[0] | ((ui32)p[1] << 8) | ((ui32)p[2] << 16) | ((ui32)p[3] << 24); @@ -464,8 +484,11 @@ namespace ojph { // of the machine's endianness static inline ui32 load_le_ui16x2(const ui16 *p) { - if (is_machine_little_endian) - return *(const ui32 *)p; + if (is_machine_little_endian) { + ui32 val; + std::memcpy(&val, p, sizeof(val)); + return val; + } else return (ui32)p[0] | ((ui32)p[1] << 16); } diff --git a/src/core/openjph/ojph_simd_vsx.h b/src/core/openjph/ojph_simd_vsx.h new file mode 100644 index 00000000..fae483d8 --- /dev/null +++ b/src/core/openjph/ojph_simd_vsx.h @@ -0,0 +1,361 @@ +//***************************************************************************/ +// This software is released under the 2-Clause BSD license, included +// below. +// +// Copyright (c) 2026, Aous Naman +// Copyright (c) 2026, Kakadu Software Pty Ltd, Australia +// Copyright (c) 2026, The University of New South Wales, Australia +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************/ +// This file is part of the OpenJPH software implementation. +// File: ojph_simd_vsx.h +// +// 128-bit SIMD helpers for POWER VSX, used by the ojph_*_vsx.cpp +// kernels. Lane numbering and operation semantics follow the same +// conventions as the other 128-bit kernels in this codebase (lane 0 +// is the lowest memory address). Supported targets are POWER9 +// (ISA 3.0) and newer, little-endian only (ppc64le). +//***************************************************************************/ + +#ifndef OJPH_SIMD_VSX_H +#define OJPH_SIMD_VSX_H + +#if !defined(__powerpc64__) && !defined(__PPC64__) + #error "this header is for 64-bit POWER targets only" +#endif +#if !defined(__LITTLE_ENDIAN__) && \ + !(defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) + #error "this header assumes a little-endian target (ppc64le)" +#endif + +#include +#include + +#include "ojph_defs.h" + +// altivec.h leaks these context-sensitive keywords as macros under GNU C; +// they break standard headers and the codebase (e.g. std::vector) +#undef vector +#undef pixel +#undef bool + +typedef __vector unsigned char v128_t; + +typedef __vector signed char vsx_v_i8; +typedef __vector unsigned char vsx_v_u8; +typedef __vector signed short vsx_v_i16; +typedef __vector unsigned short vsx_v_u16; +typedef __vector signed int vsx_v_i32; +typedef __vector unsigned int vsx_v_u32; +typedef __vector signed long long vsx_v_i64; +typedef __vector unsigned long long vsx_v_u64; +typedef __vector float vsx_v_f32; + +//--------------------------------------------------------------------------- +// load/store (alignment-agnostic; lxv/stxv handle unaligned addresses) +//--------------------------------------------------------------------------- +static inline v128_t vsx_v128_load(const void *p) +{ return vec_xl(0, (const unsigned char *)p); } + +static inline void vsx_v128_store(void *p, v128_t a) +{ vec_xst(a, 0, (unsigned char *)p); } + +#define vsx_v128_store32_lane(p, a, i) \ + do { vsx_v_i32 t_ = (vsx_v_i32)(a); int v_ = t_[(i)]; \ + std::memcpy((p), &v_, 4); } while (0) + +//--------------------------------------------------------------------------- +// constants, splats, makes +//--------------------------------------------------------------------------- +// functions, not macros, so that an argument that is itself a macro +// expanding to an argument list (e.g. OJPH_REPEAT4) works +static inline v128_t vsx_i8x16_const( + signed char c0, signed char c1, signed char c2, signed char c3, + signed char c4, signed char c5, signed char c6, signed char c7, + signed char c8, signed char c9, signed char c10, signed char c11, + signed char c12, signed char c13, signed char c14, signed char c15) +{ vsx_v_i8 v = {c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15}; + return (v128_t)v; } +static inline v128_t vsx_i16x8_const(short c0, short c1, short c2, + short c3, short c4, short c5, + short c6, short c7) +{ vsx_v_i16 v = {c0,c1,c2,c3,c4,c5,c6,c7}; return (v128_t)v; } +static inline v128_t vsx_u16x8_const(unsigned short c0, unsigned short c1, + unsigned short c2, unsigned short c3, + unsigned short c4, unsigned short c5, + unsigned short c6, unsigned short c7) +{ vsx_v_u16 v = {c0,c1,c2,c3,c4,c5,c6,c7}; return (v128_t)v; } +static inline v128_t vsx_i32x4_const(int c0, int c1, int c2, int c3) +{ vsx_v_i32 v = {c0,c1,c2,c3}; return (v128_t)v; } +static inline v128_t vsx_u32x4_const(unsigned int c0, unsigned int c1, + unsigned int c2, unsigned int c3) +{ vsx_v_u32 v = {c0,c1,c2,c3}; return (v128_t)v; } +static inline v128_t vsx_i64x2_const(long long c0, long long c1) +{ vsx_v_i64 v = {c0,c1}; return (v128_t)v; } +static inline v128_t vsx_u64x2_const(unsigned long long c0, + unsigned long long c1) +{ vsx_v_u64 v = {c0,c1}; return (v128_t)v; } + +static inline v128_t vsx_i8x16_splat(signed char x) +{ ojph_unused(x); return (v128_t)vec_splats(x); } +static inline v128_t vsx_i16x8_splat(short x) +{ ojph_unused(x); return (v128_t)vec_splats(x); } +static inline v128_t vsx_i32x4_splat(int x) +{ ojph_unused(x); return (v128_t)vec_splats(x); } +static inline v128_t vsx_u32x4_splat(unsigned int x) +{ ojph_unused(x); return (v128_t)vec_splats(x); } +static inline v128_t vsx_i64x2_splat(long long x) +{ ojph_unused(x); return (v128_t)vec_splats((signed long long)x); } +static inline v128_t vsx_f32x4_splat(float x) +{ ojph_unused(x); return (v128_t)vec_splats(x); } + +static inline v128_t vsx_i32x4_make(int a, int b, int c, int d) +{ return (v128_t)(vsx_v_i32){a, b, c, d}; } + +//--------------------------------------------------------------------------- +// lane extraction (subscript is little-endian lane order) +//--------------------------------------------------------------------------- +#define vsx_u8x16_extract_lane(a, i) (((vsx_v_u8)(a))[(i)]) +#define vsx_u16x8_extract_lane(a, i) (((vsx_v_u16)(a))[(i)]) +#define vsx_i32x4_extract_lane(a, i) (((vsx_v_i32)(a))[(i)]) +#define vsx_u32x4_extract_lane(a, i) (((vsx_v_u32)(a))[(i)]) +#define vsx_i64x2_extract_lane(a, i) (((vsx_v_i64)(a))[(i)]) + +//--------------------------------------------------------------------------- +// bitwise +//--------------------------------------------------------------------------- +static inline v128_t vsx_v128_and(v128_t a, v128_t b) +{ return vec_and(a, b); } +static inline v128_t vsx_v128_or(v128_t a, v128_t b) +{ return vec_or(a, b); } +static inline v128_t vsx_v128_xor(v128_t a, v128_t b) +{ return vec_xor(a, b); } +// a & ~b (same operand order as vec_andc) +static inline v128_t vsx_v128_andnot(v128_t a, v128_t b) +{ return vec_andc(a, b); } + +//--------------------------------------------------------------------------- +// integer arithmetic +//--------------------------------------------------------------------------- +static inline v128_t vsx_i8x16_add(v128_t a, v128_t b) +{ return (v128_t)vec_add((vsx_v_i8)a, (vsx_v_i8)b); } +static inline v128_t vsx_i16x8_add(v128_t a, v128_t b) +{ return (v128_t)vec_add((vsx_v_i16)a, (vsx_v_i16)b); } +static inline v128_t vsx_i32x4_add(v128_t a, v128_t b) +{ return (v128_t)vec_add((vsx_v_i32)a, (vsx_v_i32)b); } +static inline v128_t vsx_i64x2_add(v128_t a, v128_t b) +{ return (v128_t)vec_add((vsx_v_i64)a, (vsx_v_i64)b); } + +static inline v128_t vsx_i16x8_sub(v128_t a, v128_t b) +{ return (v128_t)vec_sub((vsx_v_i16)a, (vsx_v_i16)b); } +static inline v128_t vsx_i32x4_sub(v128_t a, v128_t b) +{ return (v128_t)vec_sub((vsx_v_i32)a, (vsx_v_i32)b); } +static inline v128_t vsx_i64x2_sub(v128_t a, v128_t b) +{ return (v128_t)vec_sub((vsx_v_i64)a, (vsx_v_i64)b); } + +// low half of products; vmladduhm / vmuluwm; i64x2 is lowered by the +// compiler (mulld on ISA 3.0, vmulld on ISA 3.1) +static inline v128_t vsx_i16x8_mul(v128_t a, v128_t b) +{ return (v128_t)((vsx_v_i16)a * (vsx_v_i16)b); } +static inline v128_t vsx_i32x4_mul(v128_t a, v128_t b) +{ return (v128_t)((vsx_v_i32)a * (vsx_v_i32)b); } +static inline v128_t vsx_i64x2_mul(v128_t a, v128_t b) +{ return (v128_t)((vsx_v_i64)a * (vsx_v_i64)b); } + +static inline v128_t vsx_i8x16_abs(v128_t a) +{ return (v128_t)vec_abs((vsx_v_i8)a); } +static inline v128_t vsx_u8x16_min(v128_t a, v128_t b) +{ return (v128_t)vec_min((vsx_v_u8)a, (vsx_v_u8)b); } +static inline v128_t vsx_i16x8_max(v128_t a, v128_t b) +{ return (v128_t)vec_max((vsx_v_i16)a, (vsx_v_i16)b); } + +//--------------------------------------------------------------------------- +// shifts (scalar count, modulo lane width) +//--------------------------------------------------------------------------- +static inline v128_t vsx_i16x8_shl(v128_t a, int n) +{ return (v128_t)vec_sl((vsx_v_i16)a, vec_splats((unsigned short)n)); } +static inline v128_t vsx_i32x4_shl(v128_t a, int n) +{ return (v128_t)vec_sl((vsx_v_i32)a, vec_splats((unsigned int)n)); } +static inline v128_t vsx_i64x2_shl(v128_t a, int n) +{ return (v128_t)vec_sl((vsx_v_i64)a, + vec_splats((unsigned long long)n)); } + +static inline v128_t vsx_i32x4_shr(v128_t a, int n) // arithmetic +{ return (v128_t)vec_sra((vsx_v_i32)a, vec_splats((unsigned int)n)); } +static inline v128_t vsx_i64x2_shr(v128_t a, int n) // arithmetic +{ return (v128_t)vec_sra((vsx_v_i64)a, + vec_splats((unsigned long long)n)); } + +static inline v128_t vsx_u16x8_shr(v128_t a, int n) // logical +{ return (v128_t)vec_sr((vsx_v_u16)a, vec_splats((unsigned short)n)); } +static inline v128_t vsx_u32x4_shr(v128_t a, int n) // logical +{ return (v128_t)vec_sr((vsx_v_u32)a, vec_splats((unsigned int)n)); } +static inline v128_t vsx_u64x2_shr(v128_t a, int n) // logical +{ return (v128_t)vec_sr((vsx_v_u64)a, + vec_splats((unsigned long long)n)); } + +//--------------------------------------------------------------------------- +// comparisons (true lanes -> all-ones, false lanes -> all-zeros) +//--------------------------------------------------------------------------- +static inline v128_t vsx_i8x16_eq(v128_t a, v128_t b) +{ return (v128_t)vec_cmpeq((vsx_v_i8)a, (vsx_v_i8)b); } +static inline v128_t vsx_i16x8_eq(v128_t a, v128_t b) +{ return (v128_t)vec_cmpeq((vsx_v_i16)a, (vsx_v_i16)b); } +static inline v128_t vsx_i32x4_eq(v128_t a, v128_t b) +{ return (v128_t)vec_cmpeq((vsx_v_i32)a, (vsx_v_i32)b); } + +static inline v128_t vsx_i8x16_gt(v128_t a, v128_t b) +{ return (v128_t)vec_cmpgt((vsx_v_i8)a, (vsx_v_i8)b); } +static inline v128_t vsx_i32x4_gt(v128_t a, v128_t b) +{ return (v128_t)vec_cmpgt((vsx_v_i32)a, (vsx_v_i32)b); } +static inline v128_t vsx_i32x4_lt(v128_t a, v128_t b) +{ return (v128_t)vec_cmplt((vsx_v_i32)a, (vsx_v_i32)b); } +static inline v128_t vsx_i64x2_lt(v128_t a, v128_t b) +{ return (v128_t)vec_cmplt((vsx_v_i64)a, (vsx_v_i64)b); } + +static inline v128_t vsx_f32x4_ge(v128_t a, v128_t b) +{ return (v128_t)vec_cmpge((vsx_v_f32)a, (vsx_v_f32)b); } +static inline v128_t vsx_f32x4_lt(v128_t a, v128_t b) +{ return (v128_t)vec_cmplt((vsx_v_f32)a, (vsx_v_f32)b); } + +//--------------------------------------------------------------------------- +// float arithmetic and conversions +//--------------------------------------------------------------------------- +static inline v128_t vsx_f32x4_add(v128_t a, v128_t b) +{ return (v128_t)vec_add((vsx_v_f32)a, (vsx_v_f32)b); } +static inline v128_t vsx_f32x4_sub(v128_t a, v128_t b) +{ return (v128_t)vec_sub((vsx_v_f32)a, (vsx_v_f32)b); } +static inline v128_t vsx_f32x4_mul(v128_t a, v128_t b) +{ return (v128_t)vec_mul((vsx_v_f32)a, (vsx_v_f32)b); } + +// xvcvspsxws: truncating, saturating (NaN gives 0x80000000; the +// callers never pass NaN) +static inline v128_t vsx_i32x4_trunc_sat_f32x4(v128_t a) +{ return (v128_t)vec_cts((vsx_v_f32)a, 0); } +static inline v128_t vsx_f32x4_convert_i32x4(v128_t a) +{ return (v128_t)vec_ctf((vsx_v_i32)a, 0); } + +//--------------------------------------------------------------------------- +// widening +//--------------------------------------------------------------------------- +static inline v128_t vsx_i64x2_extend_low_i32x4(v128_t a) +{ + // vsx_v_i32 v = (vsx_v_i32)a; + // return (v128_t)__builtin_convertvector( + // __builtin_shufflevector(v, v, 0, 1), vsx_v_i64); + + // Unpacks and sign-extends elements 0 and 1 on Little Endian + return (v128_t)vec_unpackl((vsx_v_i32)a); +} +static inline v128_t vsx_i64x2_extend_high_i32x4(v128_t a) +{ + // vsx_v_i32 v = (vsx_v_i32)a; + // return (v128_t)__builtin_convertvector( + // __builtin_shufflevector(v, v, 2, 3), vsx_v_i64); + + // Unpacks and sign-extends elements 2 and 3 on Little Endian + return (v128_t)vec_unpackh((vsx_v_i32)a); +} + +//--------------------------------------------------------------------------- +// shuffles (immediate lane indices; 0..N-1 from a, N..2N-1 from b) +//--------------------------------------------------------------------------- +// #define vsx_i8x16_shuffle(a, b, c0,c1,c2,c3,c4,c5,c6,c7, +// c8,c9,c10,c11,c12,c13,c14,c15) +// ((v128_t)__builtin_shufflevector((vsx_v_u8)(a), (vsx_v_u8)(b), +// c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15)) +// #define vsx_i16x8_shuffle(a, b, c0,c1,c2,c3,c4,c5,c6,c7) +// ((v128_t)__builtin_shufflevector((vsx_v_i16)(a), (vsx_v_i16)(b), +// c0,c1,c2,c3,c4,c5,c6,c7)) +// #define vsx_i32x4_shuffle(a, b, c0,c1,c2,c3) +// ((v128_t)__builtin_shufflevector((vsx_v_i32)(a), (vsx_v_i32)(b), +// c0,c1,c2,c3)) +// #define vsx_i64x2_shuffle(a, b, c0,c1) +// ((v128_t)__builtin_shufflevector((vsx_v_i64)(a), (vsx_v_i64)(b), c0,c1)) + +// 8-bit Shuffle (Maps direct element indices to raw byte indices) +#define vsx_i8x16_shuffle(a, b, c0,c1,c2,c3,c4,c5,c6,c7, \ + c8,c9,c10,c11,c12,c13,c14,c15) \ + ((v128_t)vec_perm((vsx_v_u8)(a), (vsx_v_u8)(b), (vsx_v_u8){ \ + (c0), (c1), (c2), (c3), (c4), (c5), (c6), (c7), \ + (c8), (c9), (c10),(c11),(c12),(c13),(c14),(c15) \ + })) + +// 16-bit Shuffle (Multiplies element index by 2 to get byte offsets) +#define vsx_i16x8_shuffle(a, b, c0,c1,c2,c3,c4,c5,c6,c7) \ + ((v128_t)vec_perm((vsx_v_u8)(a), (vsx_v_u8)(b), (vsx_v_u8){ \ + (c0)*2, (c0)*2+1, (c1)*2, (c1)*2+1, \ + (c2)*2, (c2)*2+1, (c3)*2, (c3)*2+1, \ + (c4)*2, (c4)*2+1, (c5)*2, (c5)*2+1, \ + (c6)*2, (c6)*2+1, (c7)*2, (c7)*2+1 \ + })) + +// 32-bit Shuffle (Multiplies element index by 4 to get byte offsets) +#define vsx_i32x4_shuffle(a, b, c0,c1,c2,c3) \ + ((v128_t)vec_perm((vsx_v_u8)(a), (vsx_v_u8)(b), (vsx_v_u8){ \ + (c0)*4, (c0)*4+1, (c0)*4+2, (c0)*4+3, \ + (c1)*4, (c1)*4+1, (c1)*4+2, (c1)*4+3, \ + (c2)*4, (c2)*4+1, (c2)*4+2, (c2)*4+3, \ + (c3)*4, (c3)*4+1, (c3)*4+2, (c3)*4+3 \ + })) + +// 64-bit Shuffle (Multiplies element index by 8 to get byte offsets) +#define vsx_i64x2_shuffle(a, b, c0,c1) \ + ((v128_t)vec_perm((vsx_v_u8)(a), (vsx_v_u8)(b), (vsx_v_u8){ \ + (c0)*8, (c0)*8+1, (c0)*8+2, (c0)*8+3, \ + (c0)*8+4, (c0)*8+5, (c0)*8+6, (c0)*8+7, \ + (c1)*8, (c1)*8+1, (c1)*8+2, (c1)*8+3, \ + (c1)*8+4, (c1)*8+5, (c1)*8+6, (c1)*8+7 \ + })) + +//--------------------------------------------------------------------------- +// swizzle: runtime byte-table lookup; lanes with index > 15 give 0 +//--------------------------------------------------------------------------- +static inline v128_t vsx_i8x16_swizzle(v128_t a, v128_t idx) +{ + v128_t r = vec_perm(a, a, idx); + v128_t oob = (v128_t)vec_cmpgt((vsx_v_u8)idx, + vec_splats((unsigned char)15)); + return vec_andc(r, oob); +} + +//--------------------------------------------------------------------------- +// bitmask: MSB of each byte lane -> bit of result, lane 0 -> bit 0 +// (vbpermq gathers the 16 selected bits into bits 48..63 of the +// big-endian first doubleword, which is doubleword 1 on ppc64le) +//--------------------------------------------------------------------------- +static inline int vsx_i8x16_bitmask(v128_t a) +{ +#if defined(__POWER10_VECTOR__) + return (int)vec_extractm(a); // ISA 3.1 native movemask +#else + const vsx_v_u8 perm = { 120, 112, 104, 96, 88, 80, 72, 64, + 56, 48, 40, 32, 24, 16, 8, 0 }; + vsx_v_u64 r = (vsx_v_u64)vec_bperm(a, perm); + return (int)r[1]; +#endif +} + +#endif // OJPH_SIMD_VSX_H diff --git a/src/core/others/ojph_arch.cpp b/src/core/others/ojph_arch.cpp index 1f062041..0ab12b8a 100644 --- a/src/core/others/ojph_arch.cpp +++ b/src/core/others/ojph_arch.cpp @@ -229,7 +229,38 @@ namespace ojph { #endif - #else // architectures other than Intel/AMD and ARM + #elif defined(OJPH_ARCH_PPC64LE) + + #if defined(OJPH_OS_LINUX) + + #include + #include + + bool init_cpu_ext_level(int& level) { + unsigned long hwcap = getauxval(AT_HWCAP); + unsigned long hwcap2 = getauxval(AT_HWCAP2); + level = PPC_CPU_EXT_LEVEL_GENERIC; + if ((hwcap & PPC_FEATURE_HAS_VSX) && + (hwcap2 & PPC_FEATURE2_ARCH_3_00)) { + level = PPC_CPU_EXT_LEVEL_ARCH_3_00; + #ifdef PPC_FEATURE2_ARCH_3_1 + if (hwcap2 & PPC_FEATURE2_ARCH_3_1) + level = PPC_CPU_EXT_LEVEL_ARCH_3_1; + #endif + } + return true; + } + + #else // !OJPH_OS_LINUX + + bool init_cpu_ext_level(int& level) { + level = PPC_CPU_EXT_LEVEL_GENERIC; + return true; + } + + #endif + + #else // architectures other than Intel/AMD, ARM, and PPC64LE //////////////////////////////////////////////////////////////////////////// bool init_cpu_ext_level(int& level) { diff --git a/src/core/transform/ojph_colour.cpp b/src/core/transform/ojph_colour.cpp index ef147ade..6e9d9709 100644 --- a/src/core/transform/ojph_colour.cpp +++ b/src/core/transform/ojph_colour.cpp @@ -177,6 +177,25 @@ namespace ojph { #elif defined(OJPH_ARCH_ARM) + #elif defined(OJPH_ARCH_PPC64LE) + + if (get_cpu_ext_level() >= PPC_CPU_EXT_LEVEL_ARCH_3_00) + { + // 128-bit VSX kernels; see ojph_simd_vsx.h + rev_convert = vsx_rev_convert; + rev_convert_nlt_type3 = vsx_rev_convert_nlt_type3; + irv_convert_to_integer = vsx_irv_convert_to_integer; + irv_convert_to_float = vsx_irv_convert_to_float; + irv_convert_to_integer_nlt_type3 = + vsx_irv_convert_to_integer_nlt_type3; + irv_convert_to_float_nlt_type3 = + vsx_irv_convert_to_float_nlt_type3; + rct_forward = vsx_rct_forward; + rct_backward = vsx_rct_backward; + ict_forward = vsx_ict_forward; + ict_backward = vsx_ict_backward; + } + #endif // !(defined(OJPH_ARCH_X86_64) || defined(OJPH_ARCH_I386)) #endif // !OJPH_DISABLE_SIMD diff --git a/src/core/transform/ojph_colour_local.h b/src/core/transform/ojph_colour_local.h index a85bf8bd..abe7eea0 100644 --- a/src/core/transform/ojph_colour_local.h +++ b/src/core/transform/ojph_colour_local.h @@ -2,21 +2,21 @@ // This software is released under the 2-Clause BSD license, included // below. // -// Copyright (c) 2019, Aous Naman +// Copyright (c) 2019, Aous Naman // Copyright (c) 2019, Kakadu Software Pty Ltd, Australia // Copyright (c) 2019, The University of New South Wales, Australia -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: -// +// // 1. Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. -// +// // 2. Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. -// +// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A @@ -66,14 +66,14 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// void gen_rev_convert( - const line_buf *src_line, const ui32 src_line_offset, - line_buf *dst_line, const ui32 dst_line_offset, + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// void gen_rev_convert_nlt_type3( - const line_buf *src_line, const ui32 src_line_offset, - line_buf *dst_line, const ui32 dst_line_offset, + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// @@ -158,14 +158,14 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// void sse2_rev_convert( - const line_buf *src_line, const ui32 src_line_offset, - line_buf *dst_line, const ui32 dst_line_offset, + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// void sse2_rev_convert_nlt_type3( - const line_buf *src_line, const ui32 src_line_offset, - line_buf *dst_line, const ui32 dst_line_offset, + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// @@ -214,14 +214,14 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// void avx2_rev_convert( - const line_buf *src_line, const ui32 src_line_offset, - line_buf *dst_line, const ui32 dst_line_offset, + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// void avx2_rev_convert_nlt_type3( - const line_buf *src_line, const ui32 src_line_offset, - line_buf *dst_line, const ui32 dst_line_offset, + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// @@ -257,7 +257,7 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// // // - // WASM Functions + // WASM Functions // // ////////////////////////////////////////////////////////////////////////// @@ -274,14 +274,14 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// void wasm_rev_convert( - const line_buf *src_line, const ui32 src_line_offset, - line_buf *dst_line, const ui32 dst_line_offset, + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// void wasm_rev_convert_nlt_type3( - const line_buf *src_line, const ui32 src_line_offset, - line_buf *dst_line, const ui32 dst_line_offset, + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// @@ -312,6 +312,64 @@ namespace ojph { void wasm_ict_backward(const float *y, const float *cb, const float *cr, float *r, float *g, float *b, ui32 repeat); + ////////////////////////////////////////////////////////////////////////// + // + // + // VSX Functions + // + // + ////////////////////////////////////////////////////////////////////////// + + ////////////////////////////////////////////////////////////////////////// + void vsx_irv_convert_to_integer( + const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, + ui32 bit_depth, bool is_signed, ui32 width); + + ////////////////////////////////////////////////////////////////////////// + void vsx_irv_convert_to_float( + const line_buf *src_line, ui32 src_line_offset, + line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width); + + ////////////////////////////////////////////////////////////////////////// + void vsx_rev_convert( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); + + ////////////////////////////////////////////////////////////////////////// + void vsx_rev_convert_nlt_type3( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); + + ////////////////////////////////////////////////////////////////////////// + void vsx_irv_convert_to_integer_nlt_type3( + const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, + ui32 bit_depth, bool is_signed, ui32 width); + + ////////////////////////////////////////////////////////////////////////// + void vsx_irv_convert_to_float_nlt_type3( + const line_buf *src_line, ui32 src_line_offset, + line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width); + + ////////////////////////////////////////////////////////////////////////// + void vsx_rct_forward( + const line_buf *r, const line_buf *g, const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat); + + ////////////////////////////////////////////////////////////////////////// + void vsx_rct_backward( + const line_buf *y, const line_buf *cb, const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, ui32 repeat); + + ////////////////////////////////////////////////////////////////////////// + void vsx_ict_forward(const float *r, const float *g, const float *b, + float *y, float *cb, float *cr, ui32 repeat); + + ////////////////////////////////////////////////////////////////////////// + void vsx_ict_backward(const float *y, const float *cb, const float *cr, + float *r, float *g, float *b, ui32 repeat); + } } diff --git a/src/core/transform/ojph_colour_vsx.cpp b/src/core/transform/ojph_colour_vsx.cpp new file mode 100644 index 00000000..ef358245 --- /dev/null +++ b/src/core/transform/ojph_colour_vsx.cpp @@ -0,0 +1,668 @@ +//***************************************************************************/ +// This software is released under the 2-Clause BSD license, included +// below. +// +// Copyright (c) 2021, Aous Naman +// Copyright (c) 2021, Kakadu Software Pty Ltd, Australia +// Copyright (c) 2021, The University of New South Wales, Australia +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************/ +// This file is part of the OpenJPH software implementation. +// File: ojph_colour_vsx.cpp +// Author: Aous Naman +// Date: 9 February 2021 +//***************************************************************************/ + +#include +#include +#include "ojph_simd_vsx.h" + +#include "ojph_defs.h" +#include "ojph_mem.h" +#include "ojph_colour.h" +#include "ojph_colour_local.h" + +namespace ojph { + namespace local { + + ////////////////////////////////////////////////////////////////////////// + static inline + v128_t ojph_convert_float_to_i32(v128_t a) + { // We implement ojph_round, which is + // val + (val >= 0.0f ? 0.5f : -0.5f), where val is float; this is + // round to nearest with ties away from zero, which is exactly what + // xvrspi does. The instruction is used via inline asm because + // GCC's vec_round rounds ties to even. + vsx_v_f32 w; + __asm__("xvrspi %x0,%x1" : "=wa"(w) : "wa"((vsx_v_f32)a)); + return (v128_t)vec_cts(w, 0); // saturating convert to int32 + } + + ////////////////////////////////////////////////////////////////////////// + void vsx_rev_convert(const line_buf *src_line, + const ui32 src_line_offset, + line_buf *dst_line, + const ui32 dst_line_offset, + si64 shift, ui32 width) + { + if (src_line->flags & line_buf::LFT_32BIT) + { + if (dst_line->flags & line_buf::LFT_32BIT) + { + const si32 *sp = src_line->i32 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + v128_t sh = vsx_i32x4_splat((si32)shift); + for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + { + v128_t s = vsx_v128_load(sp); + s = vsx_i32x4_add(s, sh); + vsx_v128_store(dp, s); + } + } + else + { + const si32 *sp = src_line->i32 + src_line_offset; + si64 *dp = dst_line->i64 + dst_line_offset; + v128_t sh = vsx_i64x2_splat(shift); + for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + { + v128_t s, t; + s = vsx_v128_load(sp); + + t = vsx_i64x2_extend_low_i32x4(s); + t = vsx_i64x2_add(t, sh); + vsx_v128_store(dp, t); + + t = vsx_i64x2_extend_high_i32x4(s); + t = vsx_i64x2_add(t, sh); + vsx_v128_store(dp + 2, t); + } + } + } + else + { + assert(src_line->flags | line_buf::LFT_64BIT); + assert(dst_line->flags | line_buf::LFT_32BIT); + const si64 *sp = src_line->i64 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + v128_t sh = vsx_i64x2_splat(shift); + for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + { + v128_t s0, s1; + s0 = vsx_v128_load(sp); + s0 = vsx_i64x2_add(s0, sh); + s1 = vsx_v128_load(sp + 2); + s1 = vsx_i64x2_add(s1, sh); + s0 = vsx_i32x4_shuffle(s0, s1, 0, 2, 4 + 0, 4 + 2); + vsx_v128_store(dp, s0); + } + } + } + + ////////////////////////////////////////////////////////////////////////// + void vsx_rev_convert_nlt_type3(const line_buf *src_line, + const ui32 src_line_offset, + line_buf *dst_line, + const ui32 dst_line_offset, + si64 shift, ui32 width) + { + if (src_line->flags & line_buf::LFT_32BIT) + { + if (dst_line->flags & line_buf::LFT_32BIT) + { + const si32 *sp = src_line->i32 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + v128_t sh = vsx_i32x4_splat((si32)(-shift)); + v128_t zero = vsx_i32x4_splat(0); + for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) + { + v128_t s = vsx_v128_load(sp); + v128_t c = vsx_i32x4_lt(s, zero); // 0xFFFFFFFF for -ve value + v128_t v_m_sh = vsx_i32x4_sub(sh, s); // - shift - value + v_m_sh = vsx_v128_and(c, v_m_sh); // keep only - shift - value + s = vsx_v128_andnot(s, c); // keep only +ve or 0 + s = vsx_v128_or(s, v_m_sh); // combine + vsx_v128_store(dp, s); + } + } + else + { + const si32 *sp = src_line->i32 + src_line_offset; + si64 *dp = dst_line->i64 + dst_line_offset; + v128_t sh = vsx_i64x2_splat(-shift); + v128_t zero = vsx_i32x4_splat(0); + for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) + { + v128_t s, u, c, v_m_sh; + s = vsx_v128_load(sp); + + u = vsx_i64x2_extend_low_i32x4(s); + c = vsx_i64x2_lt(u, zero); // 64b -1 for -ve value + v_m_sh = vsx_i64x2_sub(sh, u); // - shift - value + v_m_sh = vsx_v128_and(c, v_m_sh); // keep only - shift - value + u = vsx_v128_andnot(u, c); // keep only +ve or 0 + u = vsx_v128_or(u, v_m_sh); // combine + + vsx_v128_store(dp, u); + + u = vsx_i64x2_extend_high_i32x4(s); + c = vsx_i64x2_lt(u, zero); // 64b -1 for -ve value + v_m_sh = vsx_i64x2_sub(sh, u); // - shift - value + v_m_sh = vsx_v128_and(c, v_m_sh); // keep only - shift - value + u = vsx_v128_andnot(u, c); // keep only +ve or 0 + u = vsx_v128_or(u, v_m_sh); // combine + + vsx_v128_store(dp + 2, u); + } + } + } + else + { + assert(src_line->flags | line_buf::LFT_64BIT); + assert(dst_line->flags | line_buf::LFT_32BIT); + const si64 *sp = src_line->i64 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + v128_t sh = vsx_i64x2_splat(-shift); + v128_t zero = vsx_i32x4_splat(0); + for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) + { + // s for source, t for target, p for positive, n for negative, + // m for mask, and tm for temp + v128_t s, t0, t1, p, n, m, tm; + s = vsx_v128_load(sp); + m = vsx_i64x2_lt(s, zero); // 64b -1 for -ve value + tm = vsx_i64x2_sub(sh, s); // - shift - value + n = vsx_v128_and(m, tm); // -ve + p = vsx_v128_andnot(s, m); // +ve + t0 = vsx_v128_or(n, p); + + s = vsx_v128_load(sp + 2); + m = vsx_i64x2_lt(s, zero); // 64b -1 for -ve value + tm = vsx_i64x2_sub(sh, s); // - shift - value + n = vsx_v128_and(m, tm); // -ve + p = vsx_v128_andnot(s, m); // +ve + t1 = vsx_v128_or(n, p); + + t0 = vsx_i32x4_shuffle(t0, t1, 0, 2, 4 + 0, 4 + 2); + vsx_v128_store(dp, t0); + } + } + } + + ////////////////////////////////////////////////////////////////////////// + void vsx_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul, + ui32 width) + { + v128_t shift = vsx_f32x4_splat(0.5f); + v128_t m = vsx_f32x4_splat(mul); + for (ui32 i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + { + v128_t t = vsx_v128_load(sp); + v128_t s = vsx_f32x4_convert_i32x4(t); + s = vsx_f32x4_mul(s, m); + s = vsx_f32x4_sub(s, shift); + vsx_v128_store(dp, s); + } + } + + ////////////////////////////////////////////////////////////////////////// + void vsx_cnvrt_si32_to_float(const si32 *sp, float *dp, float mul, + ui32 width) + { + v128_t m = vsx_f32x4_splat(mul); + for (ui32 i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + { + v128_t t = vsx_v128_load(sp); + v128_t s = vsx_f32x4_convert_i32x4(t); + s = vsx_f32x4_mul(s, m); + vsx_v128_store(dp, s); + } + } + + ////////////////////////////////////////////////////////////////////////// + void vsx_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul, + ui32 width) + { + const v128_t half = vsx_f32x4_splat(0.5f); + v128_t m = vsx_f32x4_splat(mul); + for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + { + v128_t t = vsx_v128_load(sp); + v128_t s = vsx_f32x4_add(t, half); + s = vsx_f32x4_mul(s, m); + s = vsx_f32x4_add(s, half); // + 0.5 and followed by floor next + vsx_v128_store(dp, ojph_convert_float_to_i32(s)); + } + } + + ////////////////////////////////////////////////////////////////////////// + void vsx_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul, + ui32 width) + { + const v128_t half = vsx_f32x4_splat(0.5f); + v128_t m = vsx_f32x4_splat(mul); + for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + { + v128_t t = vsx_v128_load(sp); + v128_t s = vsx_f32x4_mul(t, m); + s = vsx_f32x4_add(s, half); // + 0.5 and followed by floor next + vsx_v128_store(dp, ojph_convert_float_to_i32(s)); + } + } + + ////////////////////////////////////////////////////////////////////////// + static inline + v128_t ojph_vsx_i32x4_max_ge(v128_t a, v128_t b, v128_t x, v128_t y) + { + v128_t c = vsx_f32x4_ge(x, y); // 0xFFFFFFFF for x >= y + return (v128_t)vec_sel((vsx_v_u32)b, (vsx_v_u32)a, (vsx_v_u32)c); + } + + ////////////////////////////////////////////////////////////////////////// + static inline + v128_t ojph_vsx_i32x4_min_lt(v128_t a, v128_t b, v128_t x, v128_t y) + { + v128_t c = vsx_f32x4_lt(x, y); // 0xFFFFFFFF for x < y + return (v128_t)vec_sel((vsx_v_u32)b, (vsx_v_u32)a, (vsx_v_u32)c); + } + + ////////////////////////////////////////////////////////////////////////// + template + static inline + void local_vsx_irv_convert_to_integer(const line_buf *src_line, + line_buf *dst_line, ui32 dst_line_offset, + ui32 bit_depth, bool is_signed, ui32 width) + { + assert((src_line->flags & line_buf::LFT_32BIT) && + (src_line->flags & line_buf::LFT_INTEGER) == 0 && + (dst_line->flags & line_buf::LFT_32BIT) && + (dst_line->flags & line_buf::LFT_INTEGER)); + + assert(bit_depth <= 32); + const float* sp = src_line->f32; + si32* dp = dst_line->i32 + dst_line_offset; + // There is the possibility that converting to integer will + // exceed the dynamic range of 32bit integer; therefore, care must be + // exercised. + // We look if the floating point number is outside the half-closed + // interval [-0.5f, 0.5f). If so, we limit the resulting integer + // to the maximum/minimum that number supports. + si32 neg_limit = (si32)INT_MIN >> (32 - bit_depth); + v128_t mul = vsx_f32x4_splat((float)(1ull << bit_depth)); + v128_t fl_up_lim = vsx_f32x4_splat(-(float)neg_limit); // val < upper + v128_t fl_low_lim = vsx_f32x4_splat((float)neg_limit); // val >= lower + v128_t s32_up_lim = vsx_i32x4_splat(INT_MAX >> (32 - bit_depth)); + v128_t s32_low_lim = vsx_i32x4_splat(INT_MIN >> (32 - bit_depth)); + + if (is_signed) + { + const v128_t zero = vsx_f32x4_splat(0.0f); + v128_t bias = vsx_i32x4_splat(-(si32)((1ULL << (bit_depth - 1)) + 1)); + for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) { + v128_t t = vsx_v128_load(sp); + t = vsx_f32x4_mul(t, mul); + v128_t u = ojph_convert_float_to_i32(t); + u = ojph_vsx_i32x4_max_ge(u, s32_low_lim, t, fl_low_lim); + u = ojph_vsx_i32x4_min_lt(u, s32_up_lim, t, fl_up_lim); + if (NLT_TYPE3) + { + v128_t c = vsx_i32x4_gt(zero, u); // 0xFFFFFFFF for -ve value + v128_t neg = vsx_i32x4_sub(bias, u); // -bias -value + neg = vsx_v128_and(c, neg); // keep only - bias - value + u = vsx_v128_andnot(u, c); // keep only +ve or 0 + u = vsx_v128_or(neg, u); // combine + } + vsx_v128_store(dp, u); + } + } + else + { + v128_t ihalf = vsx_i32x4_splat((si32)(1ULL << (bit_depth - 1))); + for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) { + v128_t t = vsx_v128_load(sp); + t = vsx_f32x4_mul(t, mul); + v128_t u = ojph_convert_float_to_i32(t); + u = ojph_vsx_i32x4_max_ge(u, s32_low_lim, t, fl_low_lim); + u = ojph_vsx_i32x4_min_lt(u, s32_up_lim, t, fl_up_lim); + u = vsx_i32x4_add(u, ihalf); + vsx_v128_store(dp, u); + } + } + } + + ////////////////////////////////////////////////////////////////////////// + void vsx_irv_convert_to_integer(const line_buf *src_line, + line_buf *dst_line, ui32 dst_line_offset, + ui32 bit_depth, bool is_signed, ui32 width) + { + local_vsx_irv_convert_to_integer(src_line, dst_line, + dst_line_offset, bit_depth, is_signed, width); + } + + ////////////////////////////////////////////////////////////////////////// + void vsx_irv_convert_to_integer_nlt_type3(const line_buf *src_line, + line_buf *dst_line, ui32 dst_line_offset, + ui32 bit_depth, bool is_signed, ui32 width) + { + local_vsx_irv_convert_to_integer(src_line, dst_line, + dst_line_offset, bit_depth, is_signed, width); + } + + ////////////////////////////////////////////////////////////////////////// + template + static inline + void local_vsx_irv_convert_to_float(const line_buf *src_line, + ui32 src_line_offset, line_buf *dst_line, + ui32 bit_depth, bool is_signed, ui32 width) + { + assert((src_line->flags & line_buf::LFT_32BIT) && + (src_line->flags & line_buf::LFT_INTEGER) && + (dst_line->flags & line_buf::LFT_32BIT) && + (dst_line->flags & line_buf::LFT_INTEGER) == 0); + + assert(bit_depth <= 32); + v128_t mul = vsx_f32x4_splat((float)(1.0 / (double)(1ULL << bit_depth))); + + const si32* sp = src_line->i32 + src_line_offset; + float* dp = dst_line->f32; + if (is_signed) + { + v128_t zero = vsx_i32x4_splat(0); + v128_t bias = vsx_i32x4_splat(-(si32)((1ULL << (bit_depth - 1)) + 1)); + for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) { + v128_t t = vsx_v128_load(sp); + if (NLT_TYPE3) + { + v128_t c = vsx_i32x4_lt(t, zero); // 0xFFFFFFFF for -ve value + v128_t neg = vsx_i32x4_sub(bias, t); // - bias - value + neg = vsx_v128_and(c, neg); // keep only - bias - value + c = vsx_v128_andnot(t, c); // keep only +ve or 0 + t = vsx_v128_or(neg, c); // combine + } + v128_t v = vsx_f32x4_convert_i32x4(t); + v = vsx_f32x4_mul(v, mul); + vsx_v128_store(dp, v); + } + } + else + { + v128_t half = vsx_i32x4_splat((si32)(1ULL << (bit_depth - 1))); + for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) { + v128_t t = vsx_v128_load(sp); + t = vsx_i32x4_sub(t, half); + v128_t v = vsx_f32x4_convert_i32x4(t); + v = vsx_f32x4_mul(v, mul); + vsx_v128_store(dp, v); + } + } + } + + ////////////////////////////////////////////////////////////////////////// + void vsx_irv_convert_to_float(const line_buf *src_line, + ui32 src_line_offset, line_buf *dst_line, + ui32 bit_depth, bool is_signed, ui32 width) + { + local_vsx_irv_convert_to_float(src_line, src_line_offset, + dst_line, bit_depth, is_signed, width); + } + + ////////////////////////////////////////////////////////////////////////// + void vsx_irv_convert_to_float_nlt_type3(const line_buf *src_line, + ui32 src_line_offset, line_buf *dst_line, + ui32 bit_depth, bool is_signed, ui32 width) + { + local_vsx_irv_convert_to_float(src_line, src_line_offset, + dst_line, bit_depth, is_signed, width); + } + + ////////////////////////////////////////////////////////////////////////// + void vsx_rct_forward(const line_buf *r, + const line_buf *g, + const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, + ui32 repeat) + { + assert((y->flags & line_buf::LFT_INTEGER) && + (cb->flags & line_buf::LFT_INTEGER) && + (cr->flags & line_buf::LFT_INTEGER) && + (r->flags & line_buf::LFT_INTEGER) && + (g->flags & line_buf::LFT_INTEGER) && + (b->flags & line_buf::LFT_INTEGER)); + + if (y->flags & line_buf::LFT_32BIT) + { + assert((y->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *rp = r->i32, * gp = g->i32, * bp = b->i32; + si32 *yp = y->i32, * cbp = cb->i32, * crp = cr->i32; + + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + v128_t mr = vsx_v128_load(rp); + v128_t mg = vsx_v128_load(gp); + v128_t mb = vsx_v128_load(bp); + v128_t t = vsx_i32x4_add(mr, mb); + t = vsx_i32x4_add(t, vsx_i32x4_shl(mg, 1)); + vsx_v128_store(yp, vsx_i32x4_shr(t, 2)); + t = vsx_i32x4_sub(mb, mg); + vsx_v128_store(cbp, t); + t = vsx_i32x4_sub(mr, mg); + vsx_v128_store(crp, t); + + rp += 4; gp += 4; bp += 4; + yp += 4; cbp += 4; crp += 4; + } + } + else + { + assert((y->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + v128_t mr32 = vsx_v128_load(rp); + v128_t mg32 = vsx_v128_load(gp); + v128_t mb32 = vsx_v128_load(bp); + v128_t mr, mg, mb, t; + mr = vsx_i64x2_extend_low_i32x4(mr32); + mg = vsx_i64x2_extend_low_i32x4(mg32); + mb = vsx_i64x2_extend_low_i32x4(mb32); + + t = vsx_i64x2_add(mr, mb); + t = vsx_i64x2_add(t, vsx_i64x2_shl(mg, 1)); + vsx_v128_store(yp, vsx_i64x2_shr(t, 2)); + t = vsx_i64x2_sub(mb, mg); + vsx_v128_store(cbp, t); + t = vsx_i64x2_sub(mr, mg); + vsx_v128_store(crp, t); + + yp += 2; cbp += 2; crp += 2; + + mr = vsx_i64x2_extend_high_i32x4(mr32); + mg = vsx_i64x2_extend_high_i32x4(mg32); + mb = vsx_i64x2_extend_high_i32x4(mb32); + + t = vsx_i64x2_add(mr, mb); + t = vsx_i64x2_add(t, vsx_i64x2_shl(mg, 1)); + vsx_v128_store(yp, vsx_i64x2_shr(t, 2)); + t = vsx_i64x2_sub(mb, mg); + vsx_v128_store(cbp, t); + t = vsx_i64x2_sub(mr, mg); + vsx_v128_store(crp, t); + + rp += 4; gp += 4; bp += 4; + yp += 2; cbp += 2; crp += 2; + } + } + } + + ////////////////////////////////////////////////////////////////////////// + void vsx_rct_backward(const line_buf *y, + const line_buf *cb, + const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, + ui32 repeat) + { + assert((y->flags & line_buf::LFT_INTEGER) && + (cb->flags & line_buf::LFT_INTEGER) && + (cr->flags & line_buf::LFT_INTEGER) && + (r->flags & line_buf::LFT_INTEGER) && + (g->flags & line_buf::LFT_INTEGER) && + (b->flags & line_buf::LFT_INTEGER)); + + if (y->flags & line_buf::LFT_32BIT) + { + assert((y->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32; + si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + v128_t my = vsx_v128_load(yp); + v128_t mcb = vsx_v128_load(cbp); + v128_t mcr = vsx_v128_load(crp); + + v128_t t = vsx_i32x4_add(mcb, mcr); + t = vsx_i32x4_sub(my, vsx_i32x4_shr(t, 2)); + vsx_v128_store(gp, t); + v128_t u = vsx_i32x4_add(mcb, t); + vsx_v128_store(bp, u); + u = vsx_i32x4_add(mcr, t); + vsx_v128_store(rp, u); + + yp += 4; cbp += 4; crp += 4; + rp += 4; gp += 4; bp += 4; + } + } + else + { + assert((y->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; + si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + v128_t my, mcb, mcr, tr0, tg0, tb0, tr1, tg1, tb1; + my = vsx_v128_load(yp); + mcb = vsx_v128_load(cbp); + mcr = vsx_v128_load(crp); + + tg0 = vsx_i64x2_add(mcb, mcr); + tg0 = vsx_i64x2_sub(my, vsx_i64x2_shr(tg0, 2)); + tb0 = vsx_i64x2_add(mcb, tg0); + tr0 = vsx_i64x2_add(mcr, tg0); + + yp += 2; cbp += 2; crp += 2; + + my = vsx_v128_load(yp); + mcb = vsx_v128_load(cbp); + mcr = vsx_v128_load(crp); + + tg1 = vsx_i64x2_add(mcb, mcr); + tg1 = vsx_i64x2_sub(my, vsx_i64x2_shr(tg1, 2)); + tb1 = vsx_i64x2_add(mcb, tg1); + tr1 = vsx_i64x2_add(mcr, tg1); + + tr0 = vsx_i32x4_shuffle(tr0, tr1, 0, 2, 4 + 0, 4 + 2); + tg0 = vsx_i32x4_shuffle(tg0, tg1, 0, 2, 4 + 0, 4 + 2); + tb0 = vsx_i32x4_shuffle(tb0, tb1, 0, 2, 4 + 0, 4 + 2); + + vsx_v128_store(rp, tr0); + vsx_v128_store(gp, tg0); + vsx_v128_store(bp, tb0); + + yp += 2; cbp += 2; crp += 2; + rp += 4; gp += 4; bp += 4; + } + } + } + + ////////////////////////////////////////////////////////////////////////// + void vsx_ict_forward(const float *r, const float *g, const float *b, + float *y, float *cb, float *cr, ui32 repeat) + { + v128_t alpha_rf = vsx_f32x4_splat(CT_CNST::ALPHA_RF); + v128_t alpha_gf = vsx_f32x4_splat(CT_CNST::ALPHA_GF); + v128_t alpha_bf = vsx_f32x4_splat(CT_CNST::ALPHA_BF); + v128_t beta_cbf = vsx_f32x4_splat(CT_CNST::BETA_CbF); + v128_t beta_crf = vsx_f32x4_splat(CT_CNST::BETA_CrF); + for (ui32 i = (repeat + 3) >> 2; i > 0; --i) + { + v128_t mr = vsx_v128_load(r); + v128_t mb = vsx_v128_load(b); + v128_t my = vsx_f32x4_mul(alpha_rf, mr); + my = vsx_f32x4_add(my, vsx_f32x4_mul(alpha_gf, vsx_v128_load(g))); + my = vsx_f32x4_add(my, vsx_f32x4_mul(alpha_bf, mb)); + vsx_v128_store(y, my); + vsx_v128_store(cb, vsx_f32x4_mul(beta_cbf, vsx_f32x4_sub(mb, my))); + vsx_v128_store(cr, vsx_f32x4_mul(beta_crf, vsx_f32x4_sub(mr, my))); + + r += 4; g += 4; b += 4; + y += 4; cb += 4; cr += 4; + } + } + + ////////////////////////////////////////////////////////////////////////// + void vsx_ict_backward(const float *y, const float *cb, const float *cr, + float *r, float *g, float *b, ui32 repeat) + { + v128_t gamma_cr2g = vsx_f32x4_splat(CT_CNST::GAMMA_CR2G); + v128_t gamma_cb2g = vsx_f32x4_splat(CT_CNST::GAMMA_CB2G); + v128_t gamma_cr2r = vsx_f32x4_splat(CT_CNST::GAMMA_CR2R); + v128_t gamma_cb2b = vsx_f32x4_splat(CT_CNST::GAMMA_CB2B); + for (ui32 i = (repeat + 3) >> 2; i > 0; --i) + { + v128_t my = vsx_v128_load(y); + v128_t mcr = vsx_v128_load(cr); + v128_t mcb = vsx_v128_load(cb); + v128_t mg = vsx_f32x4_sub(my, vsx_f32x4_mul(gamma_cr2g, mcr)); + vsx_v128_store(g, vsx_f32x4_sub(mg, vsx_f32x4_mul(gamma_cb2g, mcb))); + vsx_v128_store(r, vsx_f32x4_add(my, vsx_f32x4_mul(gamma_cr2r, mcr))); + vsx_v128_store(b, vsx_f32x4_add(my, vsx_f32x4_mul(gamma_cb2b, mcb))); + + y += 4; cb += 4; cr += 4; + r += 4; g += 4; b += 4; + } + } + + } +} diff --git a/src/core/transform/ojph_transform.cpp b/src/core/transform/ojph_transform.cpp index f67ea1b6..e587b13e 100644 --- a/src/core/transform/ojph_transform.cpp +++ b/src/core/transform/ojph_transform.cpp @@ -168,6 +168,21 @@ namespace ojph { #elif defined(OJPH_ARCH_ARM) + #elif defined(OJPH_ARCH_PPC64LE) + + if (get_cpu_ext_level() >= PPC_CPU_EXT_LEVEL_ARCH_3_00) + { + // 128-bit VSX kernels; see ojph_simd_vsx.h + rev_vert_step = vsx_rev_vert_step; + rev_horz_ana = vsx_rev_horz_ana; + rev_horz_syn = vsx_rev_horz_syn; + + irv_vert_step = vsx_irv_vert_step; + irv_vert_times_K = vsx_irv_vert_times_K; + irv_horz_ana = vsx_irv_horz_ana; + irv_horz_syn = vsx_irv_horz_syn; + } + #endif // !(defined(OJPH_ARCH_X86_64) || defined(OJPH_ARCH_I386)) #endif // !OJPH_DISABLE_SIMD diff --git a/src/core/transform/ojph_transform_local.h b/src/core/transform/ojph_transform_local.h index acf9ee6d..4bc0c143 100644 --- a/src/core/transform/ojph_transform_local.h +++ b/src/core/transform/ojph_transform_local.h @@ -2,21 +2,21 @@ // This software is released under the 2-Clause BSD license, included // below. // -// Copyright (c) 2019, Aous Naman +// Copyright (c) 2019, Aous Naman // Copyright (c) 2019, Kakadu Software Pty Ltd, Australia // Copyright (c) 2019, The University of New South Wales, Australia -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: -// +// // 1. Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. -// +// // 2. Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. -// +// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A @@ -63,21 +63,21 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////// - void gen_irv_vert_step(const lifting_step* s, const line_buf* sig, - const line_buf* other, const line_buf* aug, + void gen_irv_vert_step(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, ui32 repeat, bool synthesis); ///////////////////////////////////////////////////////////////////////// void gen_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat); ///////////////////////////////////////////////////////////////////////// - void gen_irv_horz_ana(const param_atk* atk, const line_buf* ldst, - const line_buf* hdst, const line_buf* src, + void gen_irv_horz_ana(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, ui32 width, bool even); ///////////////////////////////////////////////////////////////////////// - void gen_irv_horz_syn(const param_atk *atk, const line_buf* dst, - const line_buf *lsrc, const line_buf *hsrc, + void gen_irv_horz_syn(const param_atk *atk, const line_buf* dst, + const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even); ////////////////////////////////////////////////////////////////////////// @@ -85,18 +85,18 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////// - void gen_rev_vert_step(const lifting_step* s, const line_buf* sig, - const line_buf* other, const line_buf* aug, + void gen_rev_vert_step(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, ui32 repeat, bool synthesis); ///////////////////////////////////////////////////////////////////////// - void gen_rev_horz_ana(const param_atk* atk, const line_buf* ldst, - const line_buf* hdst, const line_buf* src, + void gen_rev_horz_ana(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, ui32 width, bool even); ///////////////////////////////////////////////////////////////////////// - void gen_rev_horz_syn(const param_atk* atk, const line_buf* dst, - const line_buf* lsrc, const line_buf* hsrc, + void gen_rev_horz_syn(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, ui32 width, bool even); ////////////////////////////////////////////////////////////////////////// @@ -112,8 +112,8 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////// - void sse_irv_vert_step(const lifting_step* s, const line_buf* sig, - const line_buf* other, const line_buf* aug, + void sse_irv_vert_step(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, ui32 repeat, bool synthesis); ///////////////////////////////////////////////////////////////////////// @@ -121,12 +121,12 @@ namespace ojph { ///////////////////////////////////////////////////////////////////////// void sse_irv_horz_ana(const param_atk* atk, const line_buf* ldst, - const line_buf* hdst, const line_buf* src, + const line_buf* hdst, const line_buf* src, ui32 width, bool even); ///////////////////////////////////////////////////////////////////////// void sse_irv_horz_syn(const param_atk *atk, const line_buf* dst, - const line_buf *lsrc, const line_buf *hsrc, + const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even); ////////////////////////////////////////////////////////////////////////// @@ -142,18 +142,18 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////// - void sse2_rev_vert_step(const lifting_step* s, const line_buf* sig, - const line_buf* other, const line_buf* aug, + void sse2_rev_vert_step(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, ui32 repeat, bool synthesis); ///////////////////////////////////////////////////////////////////////// void sse2_rev_horz_ana(const param_atk* atk, const line_buf* ldst, - const line_buf* hdst, const line_buf* src, + const line_buf* hdst, const line_buf* src, ui32 width, bool even); ///////////////////////////////////////////////////////////////////////// void sse2_rev_horz_syn(const param_atk* atk, const line_buf* dst, - const line_buf* lsrc, const line_buf* hsrc, + const line_buf* lsrc, const line_buf* hsrc, ui32 width, bool even); @@ -170,8 +170,8 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////// - void avx_irv_vert_step(const lifting_step* s, const line_buf* sig, - const line_buf* other, const line_buf* aug, + void avx_irv_vert_step(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, ui32 repeat, bool synthesis); ///////////////////////////////////////////////////////////////////////// @@ -179,12 +179,12 @@ namespace ojph { ///////////////////////////////////////////////////////////////////////// void avx_irv_horz_ana(const param_atk* atk, const line_buf* ldst, - const line_buf* hdst, const line_buf* src, + const line_buf* hdst, const line_buf* src, ui32 width, bool even); ///////////////////////////////////////////////////////////////////////// void avx_irv_horz_syn(const param_atk *atk, const line_buf* dst, - const line_buf *lsrc, const line_buf *hsrc, + const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even); ////////////////////////////////////////////////////////////////////////// @@ -200,18 +200,18 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////// - void avx2_rev_vert_step(const lifting_step* s, const line_buf* sig, - const line_buf* other, const line_buf* aug, + void avx2_rev_vert_step(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, ui32 repeat, bool synthesis); ///////////////////////////////////////////////////////////////////////// void avx2_rev_horz_ana(const param_atk* atk, const line_buf* ldst, - const line_buf* hdst, const line_buf* src, + const line_buf* hdst, const line_buf* src, ui32 width, bool even); ///////////////////////////////////////////////////////////////////////// void avx2_rev_horz_syn(const param_atk* atk, const line_buf* dst, - const line_buf* lsrc, const line_buf* hsrc, + const line_buf* lsrc, const line_buf* hsrc, ui32 width, bool even); ////////////////////////////////////////////////////////////////////////// @@ -227,8 +227,8 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////// - void avx512_irv_vert_step(const lifting_step* s, const line_buf* sig, - const line_buf* other, const line_buf* aug, + void avx512_irv_vert_step(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, ui32 repeat, bool synthesis); ///////////////////////////////////////////////////////////////////////// @@ -236,12 +236,12 @@ namespace ojph { ///////////////////////////////////////////////////////////////////////// void avx512_irv_horz_ana(const param_atk* atk, const line_buf* ldst, - const line_buf* hdst, const line_buf* src, + const line_buf* hdst, const line_buf* src, ui32 width, bool even); ///////////////////////////////////////////////////////////////////////// void avx512_irv_horz_syn(const param_atk *atk, const line_buf* dst, - const line_buf *lsrc, const line_buf *hsrc, + const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even); @@ -251,17 +251,17 @@ namespace ojph { ///////////////////////////////////////////////////////////////////////// void avx512_rev_vert_step(const lifting_step* s, const line_buf* sig, - const line_buf* other, const line_buf* aug, + const line_buf* other, const line_buf* aug, ui32 repeat, bool synthesis); ///////////////////////////////////////////////////////////////////////// void avx512_rev_horz_ana(const param_atk* atk, const line_buf* ldst, - const line_buf* hdst, const line_buf* src, + const line_buf* hdst, const line_buf* src, ui32 width, bool even); ///////////////////////////////////////////////////////////////////////// void avx512_rev_horz_syn(const param_atk* atk, const line_buf* dst, - const line_buf* lsrc, const line_buf* hsrc, + const line_buf* lsrc, const line_buf* hsrc, ui32 width, bool even); ////////////////////////////////////////////////////////////////////////// @@ -277,8 +277,8 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////// - void wasm_irv_vert_step(const lifting_step* s, const line_buf* sig, - const line_buf* other, const line_buf* aug, + void wasm_irv_vert_step(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, ui32 repeat, bool synthesis); ///////////////////////////////////////////////////////////////////////// @@ -286,12 +286,12 @@ namespace ojph { ///////////////////////////////////////////////////////////////////////// void wasm_irv_horz_ana(const param_atk* atk, const line_buf* ldst, - const line_buf* hdst, const line_buf* src, + const line_buf* hdst, const line_buf* src, ui32 width, bool even); ///////////////////////////////////////////////////////////////////////// void wasm_irv_horz_syn(const param_atk *atk, const line_buf* dst, - const line_buf *lsrc, const line_buf *hsrc, + const line_buf *lsrc, const line_buf *hsrc, ui32 width, bool even); ////////////////////////////////////////////////////////////////////////// @@ -300,17 +300,65 @@ namespace ojph { ///////////////////////////////////////////////////////////////////////// void wasm_rev_vert_step(const lifting_step* s, const line_buf* sig, - const line_buf* other, const line_buf* aug, + const line_buf* other, const line_buf* aug, ui32 repeat, bool synthesis); ///////////////////////////////////////////////////////////////////////// void wasm_rev_horz_ana(const param_atk* atk, const line_buf* ldst, - const line_buf* hdst, const line_buf* src, + const line_buf* hdst, const line_buf* src, ui32 width, bool even); ///////////////////////////////////////////////////////////////////////// void wasm_rev_horz_syn(const param_atk* atk, const line_buf* dst, - const line_buf* lsrc, const line_buf* hsrc, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even); + ////////////////////////////////////////////////////////////////////////// + // + // + // VSX Functions + // + // + ////////////////////////////////////////////////////////////////////////// + + ////////////////////////////////////////////////////////////////////////// + // Irreversible functions + ////////////////////////////////////////////////////////////////////////// + + ///////////////////////////////////////////////////////////////////////// + void vsx_irv_vert_step(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, + ui32 repeat, bool synthesis); + + ///////////////////////////////////////////////////////////////////////// + void vsx_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat); + + ///////////////////////////////////////////////////////////////////////// + void vsx_irv_horz_ana(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even); + + ///////////////////////////////////////////////////////////////////////// + void vsx_irv_horz_syn(const param_atk *atk, const line_buf* dst, + const line_buf *lsrc, const line_buf *hsrc, + ui32 width, bool even); + + ////////////////////////////////////////////////////////////////////////// + // Reversible functions + ////////////////////////////////////////////////////////////////////////// + + ///////////////////////////////////////////////////////////////////////// + void vsx_rev_vert_step(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, + ui32 repeat, bool synthesis); + + ///////////////////////////////////////////////////////////////////////// + void vsx_rev_horz_ana(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even); + + ///////////////////////////////////////////////////////////////////////// + void vsx_rev_horz_syn(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, ui32 width, bool even); } } diff --git a/src/core/transform/ojph_transform_vsx.cpp b/src/core/transform/ojph_transform_vsx.cpp new file mode 100644 index 00000000..ba8e4d9b --- /dev/null +++ b/src/core/transform/ojph_transform_vsx.cpp @@ -0,0 +1,1310 @@ +//***************************************************************************/ +// This software is released under the 2-Clause BSD license, included +// below. +// +// Copyright (c) 2021, Aous Naman +// Copyright (c) 2021, Kakadu Software Pty Ltd, Australia +// Copyright (c) 2021, The University of New South Wales, Australia +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************/ +// This file is part of the OpenJPH software implementation. +// File: ojph_transform_vsx.cpp +// Author: Aous Naman +// Date: 09 February 2021 +//***************************************************************************/ + +#include +#include "ojph_simd_vsx.h" + +#include "ojph_defs.h" +#include "ojph_arch.h" +#include "ojph_mem.h" +#include "ojph_params.h" +#include "../codestream/ojph_params_local.h" + +#include "ojph_transform.h" +#include "ojph_transform_local.h" + +namespace ojph { + namespace local { + + ////////////////////////////////////////////////////////////////////////// + static inline + void vsx_deinterleave32(float* dpl, float* dph, float* sp, int width) + { + for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4) + { + v128_t a = vsx_v128_load(sp); + v128_t b = vsx_v128_load(sp + 4); + v128_t c = vsx_i32x4_shuffle(a, b, 0, 2, 4 + 0, 4 + 2); + v128_t d = vsx_i32x4_shuffle(a, b, 1, 3, 4 + 1, 4 + 3); + // v128_t c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); + // v128_t d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); + vsx_v128_store(dpl, c); + vsx_v128_store(dph, d); + } + } + + ////////////////////////////////////////////////////////////////////////// + static inline + void vsx_interleave32(float* dp, float* spl, float* sph, int width) + { + for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4) + { + v128_t a = vsx_v128_load(spl); + v128_t b = vsx_v128_load(sph); + v128_t c = vsx_i32x4_shuffle(a, b, 0, 4 + 0, 1, 4 + 1); + v128_t d = vsx_i32x4_shuffle(a, b, 2, 4 + 2, 3, 4 + 3); + // v128_t c = _mm_unpacklo_ps(a, b); + // v128_t d = _mm_unpackhi_ps(a, b); + vsx_v128_store(dp, c); + vsx_v128_store(dp + 4, d); + } + } + + ////////////////////////////////////////////////////////////////////////// + static inline + void vsx_deinterleave64(double* dpl, double* dph, double* sp, int width) + { + for (; width > 0; width -= 4, sp += 4, dpl += 2, dph += 2) + { + v128_t a = vsx_v128_load(sp); + v128_t b = vsx_v128_load(sp + 2); + v128_t c = vsx_i64x2_shuffle(a, b, 0, 2 + 0); + v128_t d = vsx_i64x2_shuffle(a, b, 1, 2 + 1); + vsx_v128_store(dpl, c); + vsx_v128_store(dph, d); + } + } + + ////////////////////////////////////////////////////////////////////////// + static inline + void vsx_interleave64(double* dp, double* spl, double* sph, int width) + { + for (; width > 0; width -= 4, dp += 4, spl += 2, sph += 2) + { + v128_t a = vsx_v128_load(spl); + v128_t b = vsx_v128_load(sph); + v128_t c = vsx_i64x2_shuffle(a, b, 0, 2 + 0); + v128_t d = vsx_i64x2_shuffle(a, b, 1, 2 + 1); + vsx_v128_store(dp, c); + vsx_v128_store(dp + 2, d); + } + } + + ////////////////////////////////////////////////////////////////////////// + static inline void vsx_multiply_const(float* p, float f, int width) + { + v128_t factor = vsx_f32x4_splat(f); + for (; width > 0; width -= 4, p += 4) + { + v128_t s = vsx_v128_load(p); + vsx_v128_store(p, vsx_f32x4_mul(factor, s)); + } + } + + ////////////////////////////////////////////////////////////////////////// + void vsx_irv_vert_step(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, + ui32 repeat, bool synthesis) + { + float a = s->irv.Aatk; + if (synthesis) + a = -a; + + v128_t factor = vsx_f32x4_splat(a); + + float* dst = aug->f32; + const float* src1 = sig->f32, * src2 = other->f32; + int i = (int)repeat; + for ( ; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4) + { + v128_t s1 = vsx_v128_load(src1); + v128_t s2 = vsx_v128_load(src2); + v128_t d = vsx_v128_load(dst); + d = vsx_f32x4_add(d, vsx_f32x4_mul(factor, vsx_f32x4_add(s1, s2))); + vsx_v128_store(dst, d); + } + } + + ////////////////////////////////////////////////////////////////////////// + void vsx_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat) + { + vsx_multiply_const(aug->f32, K, (int)repeat); + } + + ///////////////////////////////////////////////////////////////////////// + void vsx_irv_horz_ana(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) + { + if (width > 1) + { + // split src into ldst and hdst + { + float* dpl = even ? ldst->f32 : hdst->f32; + float* dph = even ? hdst->f32 : ldst->f32; + float* sp = src->f32; + int w = (int)width; + vsx_deinterleave32(dpl, dph, sp, w); + } + + // the actual horizontal transform + float* hp = hdst->f32, * lp = ldst->f32; + ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 h_width = (width + (even ? 0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = num_steps; j > 0; --j) + { + const lifting_step* s = atk->get_step(j - 1); + const float a = s->irv.Aatk; + + // extension + lp[-1] = lp[0]; + lp[l_width] = lp[l_width - 1]; + // lifting step + const float* sp = lp; + float* dp = hp; + int i = (int)h_width; + v128_t f = vsx_f32x4_splat(a); + if (even) + { + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + v128_t m = vsx_v128_load(sp); + v128_t n = vsx_v128_load(sp + 1); + v128_t p = vsx_v128_load(dp); + p = vsx_f32x4_add(p, vsx_f32x4_mul(f, vsx_f32x4_add(m, n))); + vsx_v128_store(dp, p); + } + } + else + { + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + v128_t m = vsx_v128_load(sp); + v128_t n = vsx_v128_load(sp - 1); + v128_t p = vsx_v128_load(dp); + p = vsx_f32x4_add(p, vsx_f32x4_mul(f, vsx_f32x4_add(m, n))); + vsx_v128_store(dp, p); + } + } + + // swap buffers + float* t = lp; lp = hp; hp = t; + even = !even; + ui32 w = l_width; l_width = h_width; h_width = w; + } + + { // multiply by K or 1/K + float K = atk->get_K(); + float K_inv = 1.0f / K; + vsx_multiply_const(lp, K_inv, (int)l_width); + vsx_multiply_const(hp, K, (int)h_width); + } + } + else { + if (even) + ldst->f32[0] = src->f32[0]; + else + hdst->f32[0] = src->f32[0] * 2.0f; + } + } + + ////////////////////////////////////////////////////////////////////////// + void vsx_irv_horz_syn(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (width > 1) + { + bool ev = even; + float* oth = hsrc->f32, * aug = lsrc->f32; + ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 oth_width = (width + (even ? 0 : 1)) >> 1; // high pass + + { // multiply by K or 1/K + float K = atk->get_K(); + float K_inv = 1.0f / K; + vsx_multiply_const(aug, K, (int)aug_width); + vsx_multiply_const(oth, K_inv, (int)oth_width); + } + + // the actual horizontal transform + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = 0; j < num_steps; ++j) + { + const lifting_step* s = atk->get_step(j); + const float a = s->irv.Aatk; + + // extension + oth[-1] = oth[0]; + oth[oth_width] = oth[oth_width - 1]; + // lifting step + const float* sp = oth; + float* dp = aug; + int i = (int)aug_width; + v128_t f = vsx_f32x4_splat(a); + if (ev) + { + for ( ; i > 0; i -= 4, sp += 4, dp += 4) + { + v128_t m = vsx_v128_load(sp); + v128_t n = vsx_v128_load(sp - 1); + v128_t p = vsx_v128_load(dp); + p = vsx_f32x4_sub(p, vsx_f32x4_mul(f, vsx_f32x4_add(m, n))); + vsx_v128_store(dp, p); + } + } + else + { + for ( ; i > 0; i -= 4, sp += 4, dp += 4) + { + v128_t m = vsx_v128_load(sp); + v128_t n = vsx_v128_load(sp + 1); + v128_t p = vsx_v128_load(dp); + p = vsx_f32x4_sub(p, vsx_f32x4_mul(f, vsx_f32x4_add(m, n))); + vsx_v128_store(dp, p); + } + } + + // swap buffers + float* t = aug; aug = oth; oth = t; + ev = !ev; + ui32 w = aug_width; aug_width = oth_width; oth_width = w; + } + + // combine both lsrc and hsrc into dst + { + float* dp = dst->f32; + float* spl = even ? lsrc->f32 : hsrc->f32; + float* sph = even ? hsrc->f32 : lsrc->f32; + int w = (int)width; + vsx_interleave32(dp, spl, sph, w); + } + } + else { + if (even) + dst->f32[0] = lsrc->f32[0]; + else + dst->f32[0] = hsrc->f32[0] * 0.5f; + } + } + + ///////////////////////////////////////////////////////////////////////// + void vsx_rev_vert_step32(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, + ui32 repeat, bool synthesis) + { + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + v128_t va = vsx_i32x4_splat(a); + v128_t vb = vsx_i32x4_splat(b); + + si32* dst = aug->i32; + const si32* src1 = sig->i32, * src2 = other->i32; + // The general definition of the wavelet in Part 2 is slightly + // different to part 2, although they are mathematically equivalent + // here, we identify the simpler form from Part 1 and employ them + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)repeat; + if (synthesis) + for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4) + { + v128_t s1 = vsx_v128_load((v128_t*)src1); + v128_t s2 = vsx_v128_load((v128_t*)src2); + v128_t d = vsx_v128_load((v128_t*)dst); + v128_t t = vsx_i32x4_add(s1, s2); + v128_t v = vsx_i32x4_add(vb, t); + v128_t w = vsx_i32x4_shr(v, e); + d = vsx_i32x4_sub(d, w); + vsx_v128_store((v128_t*)dst, d); + } + else + for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4) + { + v128_t s1 = vsx_v128_load((v128_t*)src1); + v128_t s2 = vsx_v128_load((v128_t*)src2); + v128_t d = vsx_v128_load((v128_t*)dst); + v128_t t = vsx_i32x4_add(s1, s2); + v128_t v = vsx_i32x4_add(vb, t); + v128_t w = vsx_i32x4_shr(v, e); + d = vsx_i32x4_add(d, w); + vsx_v128_store((v128_t*)dst, d); + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)repeat; + if (synthesis) + for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4) + { + v128_t s1 = vsx_v128_load((v128_t*)src1); + v128_t s2 = vsx_v128_load((v128_t*)src2); + v128_t d = vsx_v128_load((v128_t*)dst); + v128_t t = vsx_i32x4_add(s1, s2); + v128_t w = vsx_i32x4_shr(t, e); + d = vsx_i32x4_add(d, w); + vsx_v128_store((v128_t*)dst, d); + } + else + for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4) + { + v128_t s1 = vsx_v128_load((v128_t*)src1); + v128_t s2 = vsx_v128_load((v128_t*)src2); + v128_t d = vsx_v128_load((v128_t*)dst); + v128_t t = vsx_i32x4_add(s1, s2); + v128_t w = vsx_i32x4_shr(t, e); + d = vsx_i32x4_sub(d, w); + vsx_v128_store((v128_t*)dst, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)repeat; + if (synthesis) + for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4) + { + v128_t s1 = vsx_v128_load((v128_t*)src1); + v128_t s2 = vsx_v128_load((v128_t*)src2); + v128_t d = vsx_v128_load((v128_t*)dst); + v128_t t = vsx_i32x4_add(s1, s2); + v128_t v = vsx_i32x4_sub(vb, t); + v128_t w = vsx_i32x4_shr(v, e); + d = vsx_i32x4_sub(d, w); + vsx_v128_store((v128_t*)dst, d); + } + else + for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4) + { + v128_t s1 = vsx_v128_load((v128_t*)src1); + v128_t s2 = vsx_v128_load((v128_t*)src2); + v128_t d = vsx_v128_load((v128_t*)dst); + v128_t t = vsx_i32x4_add(s1, s2); + v128_t v = vsx_i32x4_sub(vb, t); + v128_t w = vsx_i32x4_shr(v, e); + d = vsx_i32x4_add(d, w); + vsx_v128_store((v128_t*)dst, d); + } + } + else + { // general case + int i = (int)repeat; + if (synthesis) + for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4) + { + v128_t s1 = vsx_v128_load((v128_t*)src1); + v128_t s2 = vsx_v128_load((v128_t*)src2); + v128_t d = vsx_v128_load((v128_t*)dst); + v128_t t = vsx_i32x4_add(s1, s2); + v128_t u = vsx_i32x4_mul(va, t); + v128_t v = vsx_i32x4_add(vb, u); + v128_t w = vsx_i32x4_shr(v, e); + d = vsx_i32x4_sub(d, w); + vsx_v128_store((v128_t*)dst, d); + } + else + for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4) + { + v128_t s1 = vsx_v128_load((v128_t*)src1); + v128_t s2 = vsx_v128_load((v128_t*)src2); + v128_t d = vsx_v128_load((v128_t*)dst); + v128_t t = vsx_i32x4_add(s1, s2); + v128_t u = vsx_i32x4_mul(va, t); + v128_t v = vsx_i32x4_add(vb, u); + v128_t w = vsx_i32x4_shr(v, e); + d = vsx_i32x4_add(d, w); + vsx_v128_store((v128_t*)dst, d); + } + } + } + + ///////////////////////////////////////////////////////////////////////// + void vsx_rev_vert_step64(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, + ui32 repeat, bool synthesis) + { + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + v128_t va = vsx_i64x2_splat(a); + v128_t vb = vsx_i64x2_splat(b); + + si64* dst = aug->i64; + const si64* src1 = sig->i64, * src2 = other->i64; + // The general definition of the wavelet in Part 2 is slightly + // different to part 2, although they are mathematically equivalent + // here, we identify the simpler form from Part 1 and employ them + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)repeat; + if (synthesis) + for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2) + { + v128_t s1 = vsx_v128_load((v128_t*)src1); + v128_t s2 = vsx_v128_load((v128_t*)src2); + v128_t d = vsx_v128_load((v128_t*)dst); + v128_t t = vsx_i64x2_add(s1, s2); + v128_t v = vsx_i64x2_add(vb, t); + v128_t w = vsx_i64x2_shr(v, e); + d = vsx_i64x2_sub(d, w); + vsx_v128_store((v128_t*)dst, d); + } + else + for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2) + { + v128_t s1 = vsx_v128_load((v128_t*)src1); + v128_t s2 = vsx_v128_load((v128_t*)src2); + v128_t d = vsx_v128_load((v128_t*)dst); + v128_t t = vsx_i64x2_add(s1, s2); + v128_t v = vsx_i64x2_add(vb, t); + v128_t w = vsx_i64x2_shr(v, e); + d = vsx_i64x2_add(d, w); + vsx_v128_store((v128_t*)dst, d); + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)repeat; + if (synthesis) + for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2) + { + v128_t s1 = vsx_v128_load((v128_t*)src1); + v128_t s2 = vsx_v128_load((v128_t*)src2); + v128_t d = vsx_v128_load((v128_t*)dst); + v128_t t = vsx_i64x2_add(s1, s2); + v128_t w = vsx_i64x2_shr(t, e); + d = vsx_i64x2_add(d, w); + vsx_v128_store((v128_t*)dst, d); + } + else + for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2) + { + v128_t s1 = vsx_v128_load((v128_t*)src1); + v128_t s2 = vsx_v128_load((v128_t*)src2); + v128_t d = vsx_v128_load((v128_t*)dst); + v128_t t = vsx_i64x2_add(s1, s2); + v128_t w = vsx_i64x2_shr(t, e); + d = vsx_i64x2_sub(d, w); + vsx_v128_store((v128_t*)dst, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)repeat; + if (synthesis) + for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2) + { + v128_t s1 = vsx_v128_load((v128_t*)src1); + v128_t s2 = vsx_v128_load((v128_t*)src2); + v128_t d = vsx_v128_load((v128_t*)dst); + v128_t t = vsx_i64x2_add(s1, s2); + v128_t v = vsx_i64x2_sub(vb, t); + v128_t w = vsx_i64x2_shr(v, e); + d = vsx_i64x2_sub(d, w); + vsx_v128_store((v128_t*)dst, d); + } + else + for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2) + { + v128_t s1 = vsx_v128_load((v128_t*)src1); + v128_t s2 = vsx_v128_load((v128_t*)src2); + v128_t d = vsx_v128_load((v128_t*)dst); + v128_t t = vsx_i64x2_add(s1, s2); + v128_t v = vsx_i64x2_sub(vb, t); + v128_t w = vsx_i64x2_shr(v, e); + d = vsx_i64x2_add(d, w); + vsx_v128_store((v128_t*)dst, d); + } + } + else + { // general case + int i = (int)repeat; + if (synthesis) + for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2) + { + v128_t s1 = vsx_v128_load((v128_t*)src1); + v128_t s2 = vsx_v128_load((v128_t*)src2); + v128_t d = vsx_v128_load((v128_t*)dst); + v128_t t = vsx_i64x2_add(s1, s2); + v128_t u = vsx_i64x2_mul(va, t); + v128_t v = vsx_i64x2_add(vb, u); + v128_t w = vsx_i64x2_shr(v, e); + d = vsx_i64x2_sub(d, w); + vsx_v128_store((v128_t*)dst, d); + } + else + for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2) + { + v128_t s1 = vsx_v128_load((v128_t*)src1); + v128_t s2 = vsx_v128_load((v128_t*)src2); + v128_t d = vsx_v128_load((v128_t*)dst); + v128_t t = vsx_i64x2_add(s1, s2); + v128_t u = vsx_i64x2_mul(va, t); + v128_t v = vsx_i64x2_add(vb, u); + v128_t w = vsx_i64x2_shr(v, e); + d = vsx_i64x2_add(d, w); + vsx_v128_store((v128_t*)dst, d); + } + } + } + + ///////////////////////////////////////////////////////////////////////// + void vsx_rev_vert_step(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, + ui32 repeat, bool synthesis) + { + if (((sig != NULL) && (sig->flags & line_buf::LFT_32BIT)) || + ((aug != NULL) && (aug->flags & line_buf::LFT_32BIT)) || + ((other != NULL) && (other->flags & line_buf::LFT_32BIT))) + { + assert((sig == NULL || sig->flags & line_buf::LFT_32BIT) && + (other == NULL || other->flags & line_buf::LFT_32BIT) && + (aug == NULL || aug->flags & line_buf::LFT_32BIT)); + vsx_rev_vert_step32(s, sig, other, aug, repeat, synthesis); + } + else + { + assert((sig == NULL || sig->flags & line_buf::LFT_64BIT) && + (other == NULL || other->flags & line_buf::LFT_64BIT) && + (aug == NULL || aug->flags & line_buf::LFT_64BIT)); + vsx_rev_vert_step64(s, sig, other, aug, repeat, synthesis); + } + } + + ///////////////////////////////////////////////////////////////////////// + static + void vsx_rev_horz_ana32(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) + { + if (width > 1) + { + // combine both lsrc and hsrc into dst + { + float* dpl = even ? ldst->f32 : hdst->f32; + float* dph = even ? hdst->f32 : ldst->f32; + float* sp = src->f32; + int w = (int)width; + vsx_deinterleave32(dpl, dph, sp, w); + } + + si32* hp = hdst->i32, * lp = ldst->i32; + ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 h_width = (width + (even ? 0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = num_steps; j > 0; --j) + { + // first lifting step + const lifting_step* s = atk->get_step(j - 1); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + v128_t va = vsx_i32x4_splat(a); + v128_t vb = vsx_i32x4_splat(b); + + // extension + lp[-1] = lp[0]; + lp[l_width] = lp[l_width - 1]; + // lifting step + const si32* sp = lp; + si32* dp = hp; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)h_width; + if (even) + { + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + v128_t s1 = vsx_v128_load((v128_t*)sp); + v128_t s2 = vsx_v128_load((v128_t*)(sp + 1)); + v128_t d = vsx_v128_load((v128_t*)dp); + v128_t t = vsx_i32x4_add(s1, s2); + v128_t v = vsx_i32x4_add(vb, t); + v128_t w = vsx_i32x4_shr(v, e); + d = vsx_i32x4_add(d, w); + vsx_v128_store((v128_t*)dp, d); + } + } + else + { + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + v128_t s1 = vsx_v128_load((v128_t*)sp); + v128_t s2 = vsx_v128_load((v128_t*)(sp - 1)); + v128_t d = vsx_v128_load((v128_t*)dp); + v128_t t = vsx_i32x4_add(s1, s2); + v128_t v = vsx_i32x4_add(vb, t); + v128_t w = vsx_i32x4_shr(v, e); + d = vsx_i32x4_add(d, w); + vsx_v128_store((v128_t*)dp, d); + } + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)h_width; + if (even) + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + v128_t s1 = vsx_v128_load((v128_t*)sp); + v128_t s2 = vsx_v128_load((v128_t*)(sp + 1)); + v128_t d = vsx_v128_load((v128_t*)dp); + v128_t t = vsx_i32x4_add(s1, s2); + v128_t w = vsx_i32x4_shr(t, e); + d = vsx_i32x4_sub(d, w); + vsx_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + v128_t s1 = vsx_v128_load((v128_t*)sp); + v128_t s2 = vsx_v128_load((v128_t*)(sp - 1)); + v128_t d = vsx_v128_load((v128_t*)dp); + v128_t t = vsx_i32x4_add(s1, s2); + v128_t w = vsx_i32x4_shr(t, e); + d = vsx_i32x4_sub(d, w); + vsx_v128_store((v128_t*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)h_width; + if (even) + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + v128_t s1 = vsx_v128_load((v128_t*)sp); + v128_t s2 = vsx_v128_load((v128_t*)(sp + 1)); + v128_t d = vsx_v128_load((v128_t*)dp); + v128_t t = vsx_i32x4_add(s1, s2); + v128_t v = vsx_i32x4_sub(vb, t); + v128_t w = vsx_i32x4_shr(v, e); + d = vsx_i32x4_add(d, w); + vsx_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + v128_t s1 = vsx_v128_load((v128_t*)sp); + v128_t s2 = vsx_v128_load((v128_t*)(sp - 1)); + v128_t d = vsx_v128_load((v128_t*)dp); + v128_t t = vsx_i32x4_add(s1, s2); + v128_t v = vsx_i32x4_sub(vb, t); + v128_t w = vsx_i32x4_shr(v, e); + d = vsx_i32x4_add(d, w); + vsx_v128_store((v128_t*)dp, d); + } + } + else + { // general case + int i = (int)h_width; + if (even) + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + v128_t s1 = vsx_v128_load((v128_t*)sp); + v128_t s2 = vsx_v128_load((v128_t*)(sp + 1)); + v128_t d = vsx_v128_load((v128_t*)dp); + v128_t t = vsx_i32x4_add(s1, s2); + v128_t u = vsx_i32x4_mul(va, t); + v128_t v = vsx_i32x4_add(vb, u); + v128_t w = vsx_i32x4_shr(v, e); + d = vsx_i32x4_add(d, w); + vsx_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + v128_t s1 = vsx_v128_load((v128_t*)sp); + v128_t s2 = vsx_v128_load((v128_t*)(sp - 1)); + v128_t d = vsx_v128_load((v128_t*)dp); + v128_t t = vsx_i32x4_add(s1, s2); + v128_t u = vsx_i32x4_mul(va, t); + v128_t v = vsx_i32x4_add(vb, u); + v128_t w = vsx_i32x4_shr(v, e); + d = vsx_i32x4_add(d, w); + vsx_v128_store((v128_t*)dp, d); + } + } + + // swap buffers + si32* t = lp; lp = hp; hp = t; + even = !even; + ui32 w = l_width; l_width = h_width; h_width = w; + } + } + else { + if (even) + ldst->i32[0] = src->i32[0]; + else + hdst->i32[0] = src->i32[0] << 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + static + void vsx_rev_horz_ana64(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) + { + if (width > 1) + { + // combine both lsrc and hsrc into dst + { + double* dpl = (double*)(even ? ldst->p : hdst->p); + double* dph = (double*)(even ? hdst->p : ldst->p); + double* sp = (double*)src->p; + int w = (int)width; + vsx_deinterleave64(dpl, dph, sp, w); + } + + si64* hp = hdst->i64, * lp = ldst->i64; + ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 h_width = (width + (even ? 0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = num_steps; j > 0; --j) + { + // first lifting step + const lifting_step* s = atk->get_step(j - 1); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + v128_t va = vsx_i64x2_splat(a); + v128_t vb = vsx_i64x2_splat(b); + + // extension + lp[-1] = lp[0]; + lp[l_width] = lp[l_width - 1]; + // lifting step + const si64* sp = lp; + si64* dp = hp; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)h_width; + if (even) + { + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = vsx_v128_load((v128_t*)sp); + v128_t s2 = vsx_v128_load((v128_t*)(sp + 1)); + v128_t d = vsx_v128_load((v128_t*)dp); + v128_t t = vsx_i64x2_add(s1, s2); + v128_t v = vsx_i64x2_add(vb, t); + v128_t w = vsx_i64x2_shr(v, e); + d = vsx_i64x2_add(d, w); + vsx_v128_store((v128_t*)dp, d); + } + } + else + { + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = vsx_v128_load((v128_t*)sp); + v128_t s2 = vsx_v128_load((v128_t*)(sp - 1)); + v128_t d = vsx_v128_load((v128_t*)dp); + v128_t t = vsx_i64x2_add(s1, s2); + v128_t v = vsx_i64x2_add(vb, t); + v128_t w = vsx_i64x2_shr(v, e); + d = vsx_i64x2_add(d, w); + vsx_v128_store((v128_t*)dp, d); + } + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)h_width; + if (even) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = vsx_v128_load((v128_t*)sp); + v128_t s2 = vsx_v128_load((v128_t*)(sp + 1)); + v128_t d = vsx_v128_load((v128_t*)dp); + v128_t t = vsx_i64x2_add(s1, s2); + v128_t w = vsx_i64x2_shr(t, e); + d = vsx_i64x2_sub(d, w); + vsx_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = vsx_v128_load((v128_t*)sp); + v128_t s2 = vsx_v128_load((v128_t*)(sp - 1)); + v128_t d = vsx_v128_load((v128_t*)dp); + v128_t t = vsx_i64x2_add(s1, s2); + v128_t w = vsx_i64x2_shr(t, e); + d = vsx_i64x2_sub(d, w); + vsx_v128_store((v128_t*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)h_width; + if (even) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = vsx_v128_load((v128_t*)sp); + v128_t s2 = vsx_v128_load((v128_t*)(sp + 1)); + v128_t d = vsx_v128_load((v128_t*)dp); + v128_t t = vsx_i64x2_add(s1, s2); + v128_t v = vsx_i64x2_sub(vb, t); + v128_t w = vsx_i64x2_shr(v, e); + d = vsx_i64x2_add(d, w); + vsx_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = vsx_v128_load((v128_t*)sp); + v128_t s2 = vsx_v128_load((v128_t*)(sp - 1)); + v128_t d = vsx_v128_load((v128_t*)dp); + v128_t t = vsx_i64x2_add(s1, s2); + v128_t v = vsx_i64x2_sub(vb, t); + v128_t w = vsx_i64x2_shr(v, e); + d = vsx_i64x2_add(d, w); + vsx_v128_store((v128_t*)dp, d); + } + } + else + { // general case + int i = (int)h_width; + if (even) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = vsx_v128_load((v128_t*)sp); + v128_t s2 = vsx_v128_load((v128_t*)(sp + 1)); + v128_t d = vsx_v128_load((v128_t*)dp); + v128_t t = vsx_i64x2_add(s1, s2); + v128_t u = vsx_i64x2_mul(va, t); + v128_t v = vsx_i64x2_add(vb, u); + v128_t w = vsx_i64x2_shr(v, e); + d = vsx_i64x2_add(d, w); + vsx_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = vsx_v128_load((v128_t*)sp); + v128_t s2 = vsx_v128_load((v128_t*)(sp - 1)); + v128_t d = vsx_v128_load((v128_t*)dp); + v128_t t = vsx_i64x2_add(s1, s2); + v128_t u = vsx_i64x2_mul(va, t); + v128_t v = vsx_i64x2_add(vb, u); + v128_t w = vsx_i64x2_shr(v, e); + d = vsx_i64x2_add(d, w); + vsx_v128_store((v128_t*)dp, d); + } + } + + // swap buffers + si64* t = lp; lp = hp; hp = t; + even = !even; + ui32 w = l_width; l_width = h_width; h_width = w; + } + } + else { + if (even) + ldst->i64[0] = src->i64[0]; + else + hdst->i64[0] = src->i64[0] << 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void vsx_rev_horz_ana(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) + { + if (src->flags & line_buf::LFT_32BIT) + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_32BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_32BIT)); + vsx_rev_horz_ana32(atk, ldst, hdst, src, width, even); + } + else + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_64BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_64BIT) && + (src == NULL || src->flags & line_buf::LFT_64BIT)); + vsx_rev_horz_ana64(atk, ldst, hdst, src, width, even); + } + } + + ////////////////////////////////////////////////////////////////////////// + void vsx_rev_horz_syn32(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (width > 1) + { + bool ev = even; + si32* oth = hsrc->i32, * aug = lsrc->i32; + ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 oth_width = (width + (even ? 0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = 0; j < num_steps; ++j) + { + const lifting_step* s = atk->get_step(j); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + v128_t va = vsx_i32x4_splat(a); + v128_t vb = vsx_i32x4_splat(b); + + // extension + oth[-1] = oth[0]; + oth[oth_width] = oth[oth_width - 1]; + // lifting step + const si32* sp = oth; + si32* dp = aug; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)aug_width; + if (ev) + { + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + v128_t s1 = vsx_v128_load((v128_t*)sp); + v128_t s2 = vsx_v128_load((v128_t*)(sp - 1)); + v128_t d = vsx_v128_load((v128_t*)dp); + v128_t t = vsx_i32x4_add(s1, s2); + v128_t v = vsx_i32x4_add(vb, t); + v128_t w = vsx_i32x4_shr(v, e); + d = vsx_i32x4_sub(d, w); + vsx_v128_store((v128_t*)dp, d); + } + } + else + { + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + v128_t s1 = vsx_v128_load((v128_t*)sp); + v128_t s2 = vsx_v128_load((v128_t*)(sp + 1)); + v128_t d = vsx_v128_load((v128_t*)dp); + v128_t t = vsx_i32x4_add(s1, s2); + v128_t v = vsx_i32x4_add(vb, t); + v128_t w = vsx_i32x4_shr(v, e); + d = vsx_i32x4_sub(d, w); + vsx_v128_store((v128_t*)dp, d); + } + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + v128_t s1 = vsx_v128_load((v128_t*)sp); + v128_t s2 = vsx_v128_load((v128_t*)(sp - 1)); + v128_t d = vsx_v128_load((v128_t*)dp); + v128_t t = vsx_i32x4_add(s1, s2); + v128_t w = vsx_i32x4_shr(t, e); + d = vsx_i32x4_add(d, w); + vsx_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + v128_t s1 = vsx_v128_load((v128_t*)sp); + v128_t s2 = vsx_v128_load((v128_t*)(sp + 1)); + v128_t d = vsx_v128_load((v128_t*)dp); + v128_t t = vsx_i32x4_add(s1, s2); + v128_t w = vsx_i32x4_shr(t, e); + d = vsx_i32x4_add(d, w); + vsx_v128_store((v128_t*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + v128_t s1 = vsx_v128_load((v128_t*)sp); + v128_t s2 = vsx_v128_load((v128_t*)(sp - 1)); + v128_t d = vsx_v128_load((v128_t*)dp); + v128_t t = vsx_i32x4_add(s1, s2); + v128_t v = vsx_i32x4_sub(vb, t); + v128_t w = vsx_i32x4_shr(v, e); + d = vsx_i32x4_sub(d, w); + vsx_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + v128_t s1 = vsx_v128_load((v128_t*)sp); + v128_t s2 = vsx_v128_load((v128_t*)(sp + 1)); + v128_t d = vsx_v128_load((v128_t*)dp); + v128_t t = vsx_i32x4_add(s1, s2); + v128_t v = vsx_i32x4_sub(vb, t); + v128_t w = vsx_i32x4_shr(v, e); + d = vsx_i32x4_sub(d, w); + vsx_v128_store((v128_t*)dp, d); + } + } + else + { // general case + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + v128_t s1 = vsx_v128_load((v128_t*)sp); + v128_t s2 = vsx_v128_load((v128_t*)(sp - 1)); + v128_t d = vsx_v128_load((v128_t*)dp); + v128_t t = vsx_i32x4_add(s1, s2); + v128_t u = vsx_i32x4_mul(va, t); + v128_t v = vsx_i32x4_add(vb, u); + v128_t w = vsx_i32x4_shr(v, e); + d = vsx_i32x4_sub(d, w); + vsx_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + v128_t s1 = vsx_v128_load((v128_t*)sp); + v128_t s2 = vsx_v128_load((v128_t*)(sp + 1)); + v128_t d = vsx_v128_load((v128_t*)dp); + v128_t t = vsx_i32x4_add(s1, s2); + v128_t u = vsx_i32x4_mul(va, t); + v128_t v = vsx_i32x4_add(vb, u); + v128_t w = vsx_i32x4_shr(v, e); + d = vsx_i32x4_sub(d, w); + vsx_v128_store((v128_t*)dp, d); + } + } + + // swap buffers + si32* t = aug; aug = oth; oth = t; + ev = !ev; + ui32 w = aug_width; aug_width = oth_width; oth_width = w; + } + + // combine both lsrc and hsrc into dst + { + float* dp = dst->f32; + float* spl = even ? lsrc->f32 : hsrc->f32; + float* sph = even ? hsrc->f32 : lsrc->f32; + int w = (int)width; + vsx_interleave32(dp, spl, sph, w); + } + } + else { + if (even) + dst->i32[0] = lsrc->i32[0]; + else + dst->i32[0] = hsrc->i32[0] >> 1; + } + } + + ////////////////////////////////////////////////////////////////////////// + void vsx_rev_horz_syn64(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (width > 1) + { + bool ev = even; + si64* oth = hsrc->i64, * aug = lsrc->i64; + ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 oth_width = (width + (even ? 0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = 0; j < num_steps; ++j) + { + const lifting_step* s = atk->get_step(j); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + v128_t va = vsx_i64x2_splat(a); + v128_t vb = vsx_i64x2_splat(b); + + // extension + oth[-1] = oth[0]; + oth[oth_width] = oth[oth_width - 1]; + // lifting step + const si64* sp = oth; + si64* dp = aug; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)aug_width; + if (ev) + { + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = vsx_v128_load((v128_t*)sp); + v128_t s2 = vsx_v128_load((v128_t*)(sp - 1)); + v128_t d = vsx_v128_load((v128_t*)dp); + v128_t t = vsx_i64x2_add(s1, s2); + v128_t v = vsx_i64x2_add(vb, t); + v128_t w = vsx_i64x2_shr(v, e); + d = vsx_i64x2_sub(d, w); + vsx_v128_store((v128_t*)dp, d); + } + } + else + { + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = vsx_v128_load((v128_t*)sp); + v128_t s2 = vsx_v128_load((v128_t*)(sp + 1)); + v128_t d = vsx_v128_load((v128_t*)dp); + v128_t t = vsx_i64x2_add(s1, s2); + v128_t v = vsx_i64x2_add(vb, t); + v128_t w = vsx_i64x2_shr(v, e); + d = vsx_i64x2_sub(d, w); + vsx_v128_store((v128_t*)dp, d); + } + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = vsx_v128_load((v128_t*)sp); + v128_t s2 = vsx_v128_load((v128_t*)(sp - 1)); + v128_t d = vsx_v128_load((v128_t*)dp); + v128_t t = vsx_i64x2_add(s1, s2); + v128_t w = vsx_i64x2_shr(t, e); + d = vsx_i64x2_add(d, w); + vsx_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = vsx_v128_load((v128_t*)sp); + v128_t s2 = vsx_v128_load((v128_t*)(sp + 1)); + v128_t d = vsx_v128_load((v128_t*)dp); + v128_t t = vsx_i64x2_add(s1, s2); + v128_t w = vsx_i64x2_shr(t, e); + d = vsx_i64x2_add(d, w); + vsx_v128_store((v128_t*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = vsx_v128_load((v128_t*)sp); + v128_t s2 = vsx_v128_load((v128_t*)(sp - 1)); + v128_t d = vsx_v128_load((v128_t*)dp); + v128_t t = vsx_i64x2_add(s1, s2); + v128_t v = vsx_i64x2_sub(vb, t); + v128_t w = vsx_i64x2_shr(v, e); + d = vsx_i64x2_sub(d, w); + vsx_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = vsx_v128_load((v128_t*)sp); + v128_t s2 = vsx_v128_load((v128_t*)(sp + 1)); + v128_t d = vsx_v128_load((v128_t*)dp); + v128_t t = vsx_i64x2_add(s1, s2); + v128_t v = vsx_i64x2_sub(vb, t); + v128_t w = vsx_i64x2_shr(v, e); + d = vsx_i64x2_sub(d, w); + vsx_v128_store((v128_t*)dp, d); + } + } + else + { // general case + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = vsx_v128_load((v128_t*)sp); + v128_t s2 = vsx_v128_load((v128_t*)(sp - 1)); + v128_t d = vsx_v128_load((v128_t*)dp); + v128_t t = vsx_i64x2_add(s1, s2); + v128_t u = vsx_i64x2_mul(va, t); + v128_t v = vsx_i64x2_add(vb, u); + v128_t w = vsx_i64x2_shr(v, e); + d = vsx_i64x2_sub(d, w); + vsx_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = vsx_v128_load((v128_t*)sp); + v128_t s2 = vsx_v128_load((v128_t*)(sp + 1)); + v128_t d = vsx_v128_load((v128_t*)dp); + v128_t t = vsx_i64x2_add(s1, s2); + v128_t u = vsx_i64x2_mul(va, t); + v128_t v = vsx_i64x2_add(vb, u); + v128_t w = vsx_i64x2_shr(v, e); + d = vsx_i64x2_sub(d, w); + vsx_v128_store((v128_t*)dp, d); + } + } + + // swap buffers + si64* t = aug; aug = oth; oth = t; + ev = !ev; + ui32 w = aug_width; aug_width = oth_width; oth_width = w; + } + + // combine both lsrc and hsrc into dst + { + double* dp = (double*)dst->p; + double* spl = (double*)(even ? lsrc->p : hsrc->p); + double* sph = (double*)(even ? hsrc->p : lsrc->p); + int w = (int)width; + vsx_interleave64(dp, spl, sph, w); + } + } + else { + if (even) + dst->i64[0] = lsrc->i64[0]; + else + dst->i64[0] = hsrc->i64[0] >> 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void vsx_rev_horz_syn(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (dst->flags & line_buf::LFT_32BIT) + { + assert((lsrc == NULL || lsrc->flags & line_buf::LFT_32BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_32BIT)); + vsx_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even); + } + else + { + assert((dst == NULL || dst->flags & line_buf::LFT_64BIT) && + (lsrc == NULL || lsrc->flags & line_buf::LFT_64BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_64BIT)); + vsx_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even); + } + } + + } // !local +} // !ojph