diff --git a/FunKey/board/funkey/busybox.config b/FunKey/board/funkey/busybox.config index 833107d..d184ec3 100644 --- a/FunKey/board/funkey/busybox.config +++ b/FunKey/board/funkey/busybox.config @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit # Busybox version: 1.32.0 -# Thu Dec 24 11:17:18 2020 +# Sun Dec 27 00:26:32 2020 # CONFIG_HAVE_DOT_CONFIG=y diff --git a/FunKey/configs/funkey_defconfig b/FunKey/configs/funkey_defconfig index 5d4a42b..8080b17 100644 --- a/FunKey/configs/funkey_defconfig +++ b/FunKey/configs/funkey_defconfig @@ -6,13 +6,13 @@ BR2_CCACHE=y BR2_OPTIMIZE_FAST=y BR2_SHARED_STATIC_LIBS=y BR2_GLOBAL_PATCH_DIR="$(BR2_EXTERNAL_FUNKEY_PATH)/board/funkey/patches" -BR2_TOOLCHAIN_BUILDROOT_VENDOR="funkey" -BR2_TOOLCHAIN_BUILDROOT_MUSL=y -BR2_PACKAGE_HOST_LINUX_HEADERS_CUSTOM_4_14=y -BR2_BINUTILS_VERSION_2_35_X=y -BR2_GCC_VERSION_10_X=y -BR2_TOOLCHAIN_BUILDROOT_CXX=y -BR2_GCC_ENABLE_LTO=y +BR2_TOOLCHAIN_EXTERNAL=y +BR2_TOOLCHAIN_EXTERNAL_CUSTOM=y +BR2_TOOLCHAIN_EXTERNAL_DOWNLOAD=y +BR2_TOOLCHAIN_EXTERNAL_URL="https://github.com/FunKey-Project/FunKey-OS/releases/download/FunKey-OS-1.1.0/FunKey-sdk-1.1.0.tar.gz" +BR2_TOOLCHAIN_EXTERNAL_HEADERS_4_14=y +BR2_TOOLCHAIN_EXTERNAL_CUSTOM_MUSL=y +BR2_TOOLCHAIN_EXTERNAL_CXX=y BR2_TARGET_OPTIMIZATION="-fno-PIC -march=armv7-a+neon-vfpv4 -mtune=cortex-a7 -mfpu=neon-vfpv4 -mvectorize-with-neon-quad" BR2_TARGET_GENERIC_HOSTNAME="FunKey" BR2_TARGET_GENERIC_ISSUE="Welcome to Buildroot for the FunKey" diff --git a/Makefile b/Makefile index 83cf861..007530d 100644 --- a/Makefile +++ b/Makefile @@ -57,6 +57,28 @@ fun: buildroot Recovery/output/.config FunKey/output/.config @$(call MESSAGE,"Making fun in FunKey") @$(BRMAKE) BR2_EXTERNAL=../FunKey O=../FunKey/output +sdk: buildroot SDK/output/.config + @$(call MESSAGE,"Making FunKey SDK") + @$(BRMAKE) BR2_EXTERNAL=../SDK O=../SDK/output prepare-sdk + @$(call MESSAGE,"Generating SDK tarball") + @cd SDK/output/host; \ + PWD=$(shell pwd); \ + export LC_ALL=C; \ + SDK=FunKey-sdk-$(shell cat FunKey/board/funkey/rootfs-overlay/etc/sw-versions | cut -f 2); \ + grep -lr "$(shell pwd)/SDK/output/host" . 
| while read -r FILE ; do \ + if file -b --mime-type "$${FILE}" | grep -q '^text/'; then \ + sed -i "s|$(shell pwd)/SDK/output/host|/opt/$${SDK}|g" "$${FILE}"; \ + fi; \ + done; \ + cd $(shell pwd); \ + tar czf "images/$${SDK}.tar.gz" \ + --owner=0 --group=0 --numeric-owner \ + --transform="s#^$(patsubst /%,%,$(shell pwd))/SDK/output/host#$${SDK}#" \ + -C / "$(patsubst /%,%,$(shell pwd))/SDK/output/host"; \ + rm -f download/toolchain-external-custom/$${SDK}.tar.gz; \ + mkdir -p download/toolchain-external-custom; \ + ln -s ../../images/$${SDK}.tar.gz download/toolchain-external-custom/ + FunKey/%: FunKey/output/.config @$(call MESSAGE,"Making $(notdir $@) in $(subst /,,$(dir $@))") @$(BR) BR2_EXTERNAL=../FunKey O=../FunKey/output $(notdir $@) @@ -65,12 +87,17 @@ Recovery/%: Recovery/output/.config @$(call MESSAGE,"Making $(notdir $@) in $(subst /,,$(dir $@))") @$(BR) BR2_EXTERNAL=../Recovery O=../Recovery/output $(notdir $@) +SDK/%: SDK/output/.config + @$(call MESSAGE,"Making $(notdir $@) in $(subst /,,$(dir $@))") + @$(BR) BR2_EXTERNAL=../SDK O=../SDK/output $(notdir $@) + #%: FunKey/output/.config # @$(call MESSAGE,"Making $@ in FunKey") # @$(BR) BR2_EXTERNAL=../FunKey O=../FunKey/output $@ source: @$(call MESSAGE,"Getting sources") + @$(BR) BR2_EXTERNAL=../SDK O=../SDK/output source @$(BR) BR2_EXTERNAL=../Recovery O=../Recovery/output source @$(BR) BR2_EXTERNAL=../FunKey O=../FunKey/output source @@ -108,6 +135,8 @@ update: fun defconfig: @$(call MESSAGE,"Updating default configs") + @$(call MESSAGE,"Updating default configs in SDK") + @$(BR) BR2_EXTERNAL=../SDK O=../SDK/output savedefconfig @$(call MESSAGE,"Updating default configs in Recovery") @$(BR) BR2_EXTERNAL=../Recovery O=../Recovery/output savedefconfig linux-update-defconfig uboot-update-defconfig busybox-update-config @$(call MESSAGE,"Updating default configs in FunKey") @@ -115,6 +144,7 @@ defconfig: clean: @$(call MESSAGE,"Clean everything") + @$(BR) BR2_EXTERNAL=../SDK O=../SDK/output distclean @$(BR) BR2_EXTERNAL=../Recovery O=../Recovery/output distclean @$(BR) BR2_EXTERNAL=../FunKey O=../FunKey/output distclean @rm -f br.log @@ -132,3 +162,8 @@ Recovery/output/.config: @$(call MESSAGE,"Configure Recovery") @mkdir -p Recovery/board/funkey/patches @$(BR) BR2_EXTERNAL=../Recovery O=../Recovery/output recovery_defconfig + +SDK/output/.config: + @$(call MESSAGE,"Configure SDK") + @mkdir -p SDK/board/funkey/patches + @$(BR) BR2_EXTERNAL=../SDK O=../SDK/output funkey_defconfig diff --git a/Recovery/board/funkey/busybox.config b/Recovery/board/funkey/busybox.config index 7327966..d556bd9 100644 --- a/Recovery/board/funkey/busybox.config +++ b/Recovery/board/funkey/busybox.config @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit # Busybox version: 1.32.0 -# Thu Dec 24 10:53:30 2020 +# Sun Dec 27 00:06:13 2020 # CONFIG_HAVE_DOT_CONFIG=y diff --git a/Recovery/configs/recovery_defconfig b/Recovery/configs/recovery_defconfig index 3477c5e..8dd3945 100644 --- a/Recovery/configs/recovery_defconfig +++ b/Recovery/configs/recovery_defconfig @@ -6,13 +6,13 @@ BR2_CCACHE=y BR2_OPTIMIZE_FAST=y BR2_SHARED_STATIC_LIBS=y BR2_GLOBAL_PATCH_DIR="$(BR2_EXTERNAL_RECOVERY_PATH)/board/funkey/patches" -BR2_TOOLCHAIN_BUILDROOT_VENDOR="funkey" -BR2_TOOLCHAIN_BUILDROOT_MUSL=y -BR2_PACKAGE_HOST_LINUX_HEADERS_CUSTOM_4_14=y -BR2_BINUTILS_VERSION_2_35_X=y -BR2_GCC_VERSION_10_X=y -BR2_TOOLCHAIN_BUILDROOT_CXX=y -BR2_GCC_ENABLE_LTO=y +BR2_TOOLCHAIN_EXTERNAL=y +BR2_TOOLCHAIN_EXTERNAL_CUSTOM=y +BR2_TOOLCHAIN_EXTERNAL_DOWNLOAD=y 
+BR2_TOOLCHAIN_EXTERNAL_URL="https://github.com/FunKey-Project/FunKey-OS/releases/download/FunKey-OS-1.1.0/FunKey-sdk-1.1.0.tar.gz"
+BR2_TOOLCHAIN_EXTERNAL_HEADERS_4_14=y
+BR2_TOOLCHAIN_EXTERNAL_CUSTOM_MUSL=y
+BR2_TOOLCHAIN_EXTERNAL_CXX=y
 BR2_TARGET_OPTIMIZATION="-fno-PIC -march=armv7-a+neon-vfpv4 -mtune=cortex-a7 -mfpu=neon-vfpv4"
 BR2_TARGET_GENERIC_HOSTNAME="FunKey"
 BR2_TARGET_GENERIC_ISSUE="Welcome to Recovery Buildroot for the FunKey"
diff --git a/SDK/Config.in b/SDK/Config.in
new file mode 100644
index 0000000..984b74f
--- /dev/null
+++ b/SDK/Config.in
@@ -0,0 +1,7 @@
+source "$BR2_EXTERNAL_SDK_PATH/package/dmtx-utils/Config.in"
+source "$BR2_EXTERNAL_SDK_PATH/package/libini/Config.in"
+source "$BR2_EXTERNAL_SDK_PATH/package/libopk/Config.in"
+source "$BR2_EXTERNAL_SDK_PATH/package/libxdgmime/Config.in"
+source "$BR2_EXTERNAL_SDK_PATH/package/agg/Config.in"
+source "$BR2_EXTERNAL_SDK_PATH/package/fluidlite/Config.in"
+source "$BR2_EXTERNAL_SDK_PATH/package/libmikmod/Config.in"
diff --git a/SDK/board/funkey/patches/pixman/0002-aarch64NEON_asm.patch b/SDK/board/funkey/patches/pixman/0002-aarch64NEON_asm.patch
new file mode 100644
index 0000000..1f27440
--- /dev/null
+++ b/SDK/board/funkey/patches/pixman/0002-aarch64NEON_asm.patch
@@ -0,0 +1,6526 @@
+From 7b128ae8c56b1055a93573004148a98465d79857 Mon Sep 17 00:00:00 2001
+From: Mizuki Asakura
+Date: Sun, 17 Apr 2016 20:16:12 +0900
+Subject: [PATCH] [mod] added aarch64 bilinear implementations (ver.4.1)
+
+Since aarch64 has a different NEON syntax from aarch32 and has no
+support for the (older) arm-simd,
+there are no SIMD accelerations for pixman on aarch64.
+
+We need new implementations.
+
+This patch also contains Ben Avison's series of patches for aarch32,
+and now the benchmark results are fine on aarch64.
+
+Please find the results in the ticket below.
+
+Added: https://bugs.freedesktop.org/show_bug.cgi?id=94758
+Signed-off-by: Mizuki Asakura
+---
+ configure.ac                             |   36 +-
+ pixman/Makefile.am                       |   17 +-
+ pixman/pixman-arm-neon.c                 |   23 +-
+ pixman/pixman-arm.c                      |    6 +
+ pixman/pixman-arma64-neon-asm-bilinear.S | 1275 ++++++++++
+ pixman/pixman-arma64-neon-asm.S          | 3704 ++++++++++++++++++++++++++++++
+ pixman/pixman-arma64-neon-asm.h          | 1310 +++++++++++
+ pixman/pixman-private.h                  |    7 +-
+ 8 files changed, 6374 insertions(+), 4 deletions(-)
+ create mode 100644 pixman/pixman-arma64-neon-asm-bilinear.S
+ create mode 100644 pixman/pixman-arma64-neon-asm.S
+ create mode 100644 pixman/pixman-arma64-neon-asm.h
+
+diff --git a/configure.ac b/configure.ac
+index 6b2134e..26203a8
+--- a/configure.ac
++++ b/configure.ac
+@@ -667,6 +667,40 @@ if test $enable_arm_neon = yes && test $have_arm_neon = no ; then
+    AC_MSG_ERROR([ARM NEON intrinsics not detected])
+ fi
+
++dnl ==========================================================================
++dnl Check if assembler is gas compatible and supports ARM-a64 NEON instructions
++have_arm_a64_neon=no
++AC_MSG_CHECKING(whether to use ARM A64 NEON assembler)
++xserver_save_CFLAGS=$CFLAGS
++CFLAGS="-x assembler-with-cpp $CFLAGS"
++AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
++.text
++.arch armv8-a
++.altmacro
++prfm pldl2strm, [x0]
++xtn v0.8b, v0.8h]])], have_arm_a64_neon=yes)
++CFLAGS=$xserver_save_CFLAGS
++
++AC_ARG_ENABLE(arm-a64-neon,
++   [AC_HELP_STRING([--disable-arm-a64-neon],
++                   [disable ARM A64 NEON fast paths])],
++   [enable_arm_a64_neon=$enableval], [enable_arm_a64_neon=auto])
++
++if test $enable_arm_a64_neon = no ; then
++   have_arm_a64_neon=disabled
++fi
++
++if test $have_arm_a64_neon = yes ; then
++   AC_DEFINE(USE_ARM_A64_NEON, 1, [use ARM A64_NEON assembly optimizations])
++fi
++
++AM_CONDITIONAL(USE_ARM_A64_NEON, test $have_arm_a64_neon = yes)
++
++AC_MSG_RESULT($have_arm_a64_neon)
++if test $enable_arm_a64_neon = yes && test $have_arm_a64_neon = no ; then
++   AC_MSG_ERROR([ARM A64 NEON intrinsics not detected])
++fi
++
+ dnl ===========================================================================
+ dnl Check for IWMMXT
+
+diff --git a/pixman/Makefile.am b/pixman/Makefile.am
+index 581b6f6..f1afa27
+--- a/pixman/Makefile.am
++++ b/pixman/Makefile.am
+@@ -94,6 +94,21 @@ libpixman_1_la_LIBADD += libpixman-arm-neon.la
+ ASM_CFLAGS_arm_neon=
+ endif
+
++# arm a64 neon code
++if USE_ARM_A64_NEON
++noinst_LTLIBRARIES += libpixman-arma64-neon.la
++libpixman_arma64_neon_la_SOURCES = \
++	pixman-arm-neon.c \
++	pixman-arm-common.h \
++	pixman-arma64-neon-asm.S \
++	pixman-arma64-neon-asm-bilinear.S \
++	pixman-arm-asm.h \
++	pixman-arma64-neon-asm.h
++libpixman_1_la_LIBADD += libpixman-arma64-neon.la
++
++ASM_CFLAGS_arm_neon=
++endif
++
+ # iwmmxt code
+ if USE_ARM_IWMMXT
+ libpixman_iwmmxt_la_SOURCES = pixman-mmx.c
+diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
+index be761c9..62c9442 100644
+--- a/pixman/pixman-arm-neon.c
++++ b/pixman/pixman-arm-neon.c
+@@ -194,7 +194,7 @@ arm_neon_fill (pixman_implementation_t *imp,
+                uint32_t                _xor)
+ {
+     /* stride is always multiple of 32bit units in pixman */
+-    uint32_t byte_stride = stride * sizeof(uint32_t);
++    int32_t byte_stride = stride * sizeof(uint32_t);
+
+     switch (bpp)
+     {
+@@ -331,6 +331,7 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
+     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, b5g6r5, neon_composite_over_8888_8_0565),
+     PIXMAN_STD_FAST_PATH (OVER, r5g6b5, a8, r5g6b5, neon_composite_over_0565_8_0565),
+     PIXMAN_STD_FAST_PATH (OVER, 
b5g6r5, a8, b5g6r5, neon_composite_over_0565_8_0565), ++ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, x8r8g8b8, neon_composite_over_8888_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, neon_composite_over_8888_8888_8888), + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, neon_composite_over_8888_0565), + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, neon_composite_over_8888_0565), +@@ -341,17 +342,33 @@ static const pixman_fast_path_t arm_neon_fast_paths[] = + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, a8r8g8b8, neon_composite_src_x888_8888), + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, a8b8g8r8, neon_composite_src_x888_8888), + PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, neon_composite_add_n_8_8), ++ PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8r8g8b8, neon_composite_add_n_8_8888), + PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, neon_composite_add_n_8_8888), ++ PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8b8g8r8, neon_composite_add_n_8_8888), + PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, neon_composite_add_n_8_8888), + PIXMAN_STD_FAST_PATH (ADD, a8, a8, a8, neon_composite_add_8_8_8), + PIXMAN_STD_FAST_PATH (ADD, r5g6b5, a8, r5g6b5, neon_composite_add_0565_8_0565), + PIXMAN_STD_FAST_PATH (ADD, b5g6r5, a8, b5g6r5, neon_composite_add_0565_8_0565), ++ PIXMAN_STD_FAST_PATH (ADD, x8r8g8b8, a8, x8r8g8b8, neon_composite_add_8888_8_8888), ++ PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, a8, x8r8g8b8, neon_composite_add_8888_8_8888), ++ PIXMAN_STD_FAST_PATH (ADD, x8b8g8r8, a8, x8b8g8r8, neon_composite_add_8888_8_8888), ++ PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, a8, x8b8g8r8, neon_composite_add_8888_8_8888), + PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, a8, a8r8g8b8, neon_composite_add_8888_8_8888), + PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, a8, a8b8g8r8, neon_composite_add_8888_8_8888), ++ PIXMAN_STD_FAST_PATH (ADD, x8r8g8b8, a8r8g8b8, x8r8g8b8, neon_composite_add_8888_8888_8888), ++ PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, x8r8g8b8, neon_composite_add_8888_8888_8888), + PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, a8r8g8b8, neon_composite_add_8888_8888_8888), ++ PIXMAN_STD_FAST_PATH (ADD, x8r8g8b8, solid, x8r8g8b8, neon_composite_add_8888_n_8888), ++ PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, solid, x8r8g8b8, neon_composite_add_8888_n_8888), ++ PIXMAN_STD_FAST_PATH (ADD, x8b8g8r8, solid, x8b8g8r8, neon_composite_add_8888_n_8888), ++ PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, solid, x8b8g8r8, neon_composite_add_8888_n_8888), + PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, solid, a8r8g8b8, neon_composite_add_8888_n_8888), + PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, solid, a8b8g8r8, neon_composite_add_8888_n_8888), + PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, neon_composite_add_8_8), ++ PIXMAN_STD_FAST_PATH (ADD, x8r8g8b8, null, x8r8g8b8, neon_composite_add_8888_8888), ++ PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, x8r8g8b8, neon_composite_add_8888_8888), ++ PIXMAN_STD_FAST_PATH (ADD, x8b8g8r8, null, x8b8g8r8, neon_composite_add_8888_8888), ++ PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, x8b8g8r8, neon_composite_add_8888_8888), + PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, neon_composite_add_8888_8888), + PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, neon_composite_add_8888_8888), + PIXMAN_STD_FAST_PATH (IN, solid, null, a8, neon_composite_in_n_8), +@@ -359,7 +376,9 @@ static const pixman_fast_path_t arm_neon_fast_paths[] = + PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, neon_composite_over_reverse_n_8888), + PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8, null, r5g6b5, 
neon_composite_out_reverse_8_0565), + PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8, null, b5g6r5, neon_composite_out_reverse_8_0565), ++ PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8, null, x8r8g8b8, neon_composite_out_reverse_8_8888), + PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8, null, a8r8g8b8, neon_composite_out_reverse_8_8888), ++ PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8, null, x8b8g8r8, neon_composite_out_reverse_8_8888), + PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8, null, a8b8g8r8, neon_composite_out_reverse_8_8888), + + SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8888), +@@ -404,6 +423,7 @@ static const pixman_fast_path_t arm_neon_fast_paths[] = + + SIMPLE_BILINEAR_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, neon_8888_8888), + SIMPLE_BILINEAR_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8, neon_8888_8888), ++ SIMPLE_BILINEAR_FAST_PATH (ADD, x8r8g8b8, x8r8g8b8, neon_8888_8888), + + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, neon_8888_8_8888), + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, neon_8888_8_8888), +@@ -420,6 +440,7 @@ static const pixman_fast_path_t arm_neon_fast_paths[] = + + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, neon_8888_8_8888), + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8, neon_8888_8_8888), ++ SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, x8r8g8b8, x8r8g8b8, neon_8888_8_8888), + + { PIXMAN_OP_NONE }, + }; +diff --git a/pixman/pixman-arm.c b/pixman/pixman-arm.c +index 23374e4..734cbea 100644 +--- a/pixman/pixman-arm.c ++++ b/pixman/pixman-arm.c +@@ -221,5 +221,11 @@ _pixman_arm_get_implementations (pixman_implementation_t *imp) + imp = _pixman_implementation_create_arm_neon (imp); + #endif + ++#ifdef USE_ARM_A64_NEON ++ /* neon is a part of aarch64 */ ++ if (!_pixman_disabled ("arm-neon")) ++ imp = _pixman_implementation_create_arm_neon (imp); ++#endif ++ + return imp; + } +diff --git a/pixman/pixman-arma64-neon-asm-bilinear.S b/pixman/pixman-arma64-neon-asm-bilinear.S +new file mode 100644 +index 0000000..aaa4a83 +--- /dev/null ++++ b/pixman/pixman-arma64-neon-asm-bilinear.S +@@ -0,0 +1,1275 @@ ++/* ++ * Copyright © 2011 SCore Corporation ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the next ++ * paragraph) shall be included in all copies or substantial portions of the ++ * Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER ++ * DEALINGS IN THE SOFTWARE. ++ * ++ * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com) ++ * Author: Taekyun Kim (tkq.kim@samsung.com) ++ */ ++ ++/* ++ * This file contains scaled bilinear scanline functions implemented ++ * using older siarhei's bilinear macro template. 
++ * ++ * << General scanline function procedures >> ++ * 1. bilinear interpolate source pixels ++ * 2. load mask pixels ++ * 3. load destination pixels ++ * 4. duplicate mask to fill whole register ++ * 5. interleave source & destination pixels ++ * 6. apply mask to source pixels ++ * 7. combine source & destination pixels ++ * 8, Deinterleave final result ++ * 9. store destination pixels ++ * ++ * All registers with single number (i.e. src0, tmp0) are 64-bits registers. ++ * Registers with double numbers(src01, dst01) are 128-bits registers. ++ * All temp registers can be used freely outside the code block. ++ * Assume that symbol(register .req) OUT and MASK are defined at caller of these macro blocks. ++ * ++ * Remarks ++ * There can be lots of pipeline stalls inside code block and between code blocks. ++ * Further optimizations will be done by new macro templates using head/tail_head/tail scheme. ++ */ ++ ++/* Prevent the stack from becoming executable for no reason... */ ++#if defined(__linux__) && defined (__ELF__) ++.section .note.GNU-stack,"",%progbits ++#endif ++ ++.text ++.arch armv8-a ++.altmacro ++.p2align 2 ++ ++#include "pixman-private.h" ++#include "pixman-arm-asm.h" ++#include "pixman-arma64-neon-asm.h" ++ ++/* ++ * Bilinear macros from pixman-arm-neon-asm.S ++ */ ++ ++/* ++ * Bilinear scaling support code which tries to provide pixel fetching, color ++ * format conversion, and interpolation as separate macros which can be used ++ * as the basic building blocks for constructing bilinear scanline functions. ++ */ ++ ++.macro bilinear_load_8888 reg1, reg2, tmp ++ asr WTMP1, X, #16 ++ add X, X, UX ++ add TMP1, TOP, TMP1, lsl #2 ++ ld1 {®1&.2s}, [TMP1], STRIDE ++ ld1 {®2&.2s}, [TMP1] ++.endm ++ ++.macro bilinear_load_0565 reg1, reg2, tmp ++ asr WTMP1, X, #16 ++ add X, X, UX ++ add TMP1, TOP, TMP1, lsl #1 ++ ld1 {®2&.s}[0], [TMP1], STRIDE ++ ld1 {®2&.s}[1], [TMP1] ++ convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp ++.endm ++ ++.macro bilinear_load_and_vertical_interpolate_two_8888 \ ++ acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2 ++ ++ bilinear_load_8888 reg1, reg2, tmp1 ++ umull &acc1&.8h, ®1&.8b, v28.8b ++ umlal &acc1&.8h, ®2&.8b, v29.8b ++ bilinear_load_8888 reg3, reg4, tmp2 ++ umull &acc2&.8h, ®3&.8b, v28.8b ++ umlal &acc2&.8h, ®4&.8b, v29.8b ++.endm ++ ++.macro bilinear_load_and_vertical_interpolate_four_8888 \ ++ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ ++ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi ++ ++ bilinear_load_and_vertical_interpolate_two_8888 \ ++ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi ++ bilinear_load_and_vertical_interpolate_two_8888 \ ++ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi ++.endm ++ ++.macro vzip reg1, reg2 ++ zip1 v24.8b, reg1, reg2 ++ zip2 reg2, reg1, reg2 ++ mov reg1, v24.8b ++.endm ++ ++.macro vuzp reg1, reg2 ++ uzp1 v24.8b, reg1, reg2 ++ uzp2 reg2, reg1, reg2 ++ mov reg1, v24.8b ++.endm ++ ++.macro bilinear_load_and_vertical_interpolate_two_0565 \ ++ acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi ++ asr WTMP1, X, #16 ++ add X, X, UX ++ add TMP1, TOP, TMP1, lsl #1 ++ asr WTMP2, X, #16 ++ add X, X, UX ++ add TMP2, TOP, TMP2, lsl #1 ++ ld1 {&acc2&.s}[0], [TMP1], STRIDE ++ ld1 {&acc2&.s}[2], [TMP2], STRIDE ++ ld1 {&acc2&.s}[1], [TMP1] ++ ld1 {&acc2&.s}[3], [TMP2] ++ convert_0565_to_x888 acc2, reg3, reg2, reg1 ++ vzip ®1&.8b, ®3&.8b ++ vzip ®2&.8b, ®4&.8b ++ vzip ®3&.8b, ®4&.8b ++ vzip ®1&.8b, ®2&.8b ++ umull &acc1&.8h, ®1&.8b, v28.8b ++ umlal &acc1&.8h, ®2&.8b, v29.8b 
++ umull &acc2&.8h, ®3&.8b, v28.8b ++ umlal &acc2&.8h, ®4&.8b, v29.8b ++.endm ++ ++.macro bilinear_load_and_vertical_interpolate_four_0565 \ ++ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \ ++ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi ++ ++ asr WTMP1, X, #16 ++ add X, X, UX ++ add TMP1, TOP, TMP1, lsl #1 ++ asr WTMP2, X, #16 ++ add X, X, UX ++ add TMP2, TOP, TMP2, lsl #1 ++ ld1 {&xacc2&.s}[0], [TMP1], STRIDE ++ ld1 {&xacc2&.s}[2], [TMP2], STRIDE ++ ld1 {&xacc2&.s}[1], [TMP1] ++ ld1 {&xacc2&.s}[3], [TMP2] ++ convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1 ++ asr WTMP1, X, #16 ++ add X, X, UX ++ add TMP1, TOP, TMP1, lsl #1 ++ asr WTMP2, X, #16 ++ add X, X, UX ++ add TMP2, TOP, TMP2, lsl #1 ++ ld1 {&yacc2&.s}[0], [TMP1], STRIDE ++ vzip &xreg1&.8b, &xreg3&.8b ++ ld1 {&yacc2&.s}[2], [TMP2], STRIDE ++ vzip &xreg2&.8b, &xreg4&.8b ++ ld1 {&yacc2&.s}[1], [TMP1] ++ vzip &xreg3&.8b, &xreg4&.8b ++ ld1 {&yacc2&.s}[3], [TMP2] ++ vzip &xreg1&.8b, &xreg2&.8b ++ convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1 ++ umull &xacc1&.8h, &xreg1&.8b, v28.8b ++ vzip &yreg1&.8b, &yreg3&.8b ++ umlal &xacc1&.8h, &xreg2&.8b, v29.8b ++ vzip &yreg2&.8b, &yreg4&.8b ++ umull &xacc2&.8h, &xreg3&.8b, v28.8b ++ vzip &yreg3&.8b, &yreg4&.8b ++ umlal &xacc2&.8h, &xreg4&.8b, v29.8b ++ vzip &yreg1&.8b, &yreg2&.8b ++ umull &yacc1&.8h, &yreg1&.8b, v28.8b ++ umlal &yacc1&.8h, &yreg2&.8b, v29.8b ++ umull &yacc2&.8h, &yreg3&.8b, v28.8b ++ umlal &yacc2&.8h, &yreg4&.8b, v29.8b ++.endm ++ ++.macro bilinear_store_8888 numpix, tmp1, tmp2 ++.if numpix == 4 ++ st1 {v0.2s, v1.2s}, [OUT], #16 ++.elseif numpix == 2 ++ st1 {v0.2s}, [OUT], #8 ++.elseif numpix == 1 ++ st1 {v0.s}[0], [OUT], #4 ++.else ++ .error bilinear_store_8888 numpix is unsupported ++.endif ++.endm ++ ++.macro bilinear_store_0565 numpix, tmp1, tmp2 ++ vuzp v0.8b, v1.8b ++ vuzp v2.8b, v3.8b ++ vuzp v1.8b, v3.8b ++ vuzp v0.8b, v2.8b ++ convert_8888_to_0565 v2, v1, v0, v1, tmp1, tmp2 ++.if numpix == 4 ++ st1 {v1.4h}, [OUT], #8 ++.elseif numpix == 2 ++ st1 {v1.s}[0], [OUT], #4 ++.elseif numpix == 1 ++ st1 {v1.h}[0], [OUT], #2 ++.else ++ .error bilinear_store_0565 numpix is unsupported ++.endif ++.endm ++ ++ ++/* ++ * Macros for loading mask pixels into register 'mask'. ++ * dup must be done in somewhere else. ++ */ ++.macro bilinear_load_mask_x numpix, mask ++.endm ++ ++.macro bilinear_load_mask_8 numpix, mask ++.if numpix == 4 ++ ld1 {&mask&.s}[0], [MASK], #4 ++.elseif numpix == 2 ++ ld1 {&mask&.h}[0], [MASK], #2 ++.elseif numpix == 1 ++ ld1 {&mask&.b}[0], [MASK], #1 ++.else ++ .error bilinear_load_mask_8 numpix is unsupported ++.endif ++ prfm PREFETCH_MODE, [MASK, #prefetch_offset] ++.endm ++ ++.macro bilinear_load_mask mask_fmt, numpix, mask ++ bilinear_load_mask_&mask_fmt numpix, mask ++.endm ++ ++ ++/* ++ * Macros for loading destination pixels into register 'dst0' and 'dst1'. ++ * Interleave should be done somewhere else. 
++ */ ++.macro bilinear_load_dst_0565_src numpix, dst0, dst1, dst01 ++.endm ++ ++.macro bilinear_load_dst_8888_src numpix, dst0, dst1, dst01 ++.endm ++ ++.macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01 ++.if numpix == 4 ++ ld1 {&dst0&.2s, &dst1&.2s}, [OUT] ++.elseif numpix == 2 ++ ld1 {&dst0&.2s}, [OUT] ++.elseif numpix == 1 ++ ld1 {&dst0&.s}[0], [OUT] ++.else ++ .error bilinear_load_dst_8888 numpix is unsupported ++.endif ++ mov &dst01&.d[0], &dst0&.d[0] ++ mov &dst01&.d[1], &dst1&.d[0] ++ prfm PREFETCH_MODE, [OUT, #(prefetch_offset * 4)] ++.endm ++ ++.macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01 ++ bilinear_load_dst_8888 numpix, dst0, dst1, dst01 ++.endm ++ ++.macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01 ++ bilinear_load_dst_8888 numpix, dst0, dst1, dst01 ++.endm ++ ++.macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01 ++ bilinear_load_dst_&dst_fmt&_&op numpix, dst0, dst1, dst01 ++.endm ++ ++/* ++ * Macros for duplicating partially loaded mask to fill entire register. ++ * We will apply mask to interleaved source pixels, that is ++ * (r0, r1, r2, r3, g0, g1, g2, g3) x (m0, m1, m2, m3, m0, m1, m2, m3) ++ * (b0, b1, b2, b3, a0, a1, a2, a3) x (m0, m1, m2, m3, m0, m1, m2, m3) ++ * So, we need to duplicate loaded mask into whole register. ++ * ++ * For two pixel case ++ * (r0, r1, x, x, g0, g1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1) ++ * (b0, b1, x, x, a0, a1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1) ++ * We can do some optimizations for this including last pixel cases. ++ */ ++.macro bilinear_duplicate_mask_x numpix, mask ++.endm ++ ++.macro bilinear_duplicate_mask_8 numpix, mask ++.if numpix == 4 ++ dup &mask&.2s, &mask&.s[0] ++.elseif numpix == 2 ++ dup &mask&.4h, &mask&.h[0] ++.elseif numpix == 1 ++ dup &mask&.8b, &mask&.b[0] ++.else ++ .error bilinear_duplicate_mask_8 is unsupported ++.endif ++.endm ++ ++.macro bilinear_duplicate_mask mask_fmt, numpix, mask ++ bilinear_duplicate_mask_&mask_fmt numpix, mask ++.endm ++ ++/* ++ * Macros for interleaving src and dst pixels to rrrr gggg bbbb aaaa form. ++ * Interleave should be done when maks is enabled or operator is 'over'. 
++ */ ++.macro bilinear_interleave src0, src1, src01, dst0, dst1, dst01 ++ vuzp &src0&.8b, &src1&.8b ++ vuzp &dst0&.8b, &dst1&.8b ++ vuzp &src0&.8b, &src1&.8b ++ vuzp &dst0&.8b, &dst1&.8b ++ mov &src01&.d[1], &src1&.d[0] ++ mov &src01&.d[0], &src0&.d[0] ++ mov &dst01&.d[1], &dst1&.d[0] ++ mov &dst01&.d[0], &dst0&.d[0] ++.endm ++ ++.macro bilinear_interleave_src_dst_x_src \ ++ numpix, src0, src1, src01, dst0, dst1, dst01 ++.endm ++ ++.macro bilinear_interleave_src_dst_x_over \ ++ numpix, src0, src1, src01, dst0, dst1, dst01 ++ ++ bilinear_interleave src0, src1, src01, dst0, dst1, dst01 ++.endm ++ ++.macro bilinear_interleave_src_dst_x_add \ ++ numpix, src0, src1, src01, dst0, dst1, dst01 ++ bilinear_interleave src0, src1, src01, dst0, dst1, dst01 ++.endm ++ ++.macro bilinear_interleave_src_dst_8_src \ ++ numpix, src0, src1, src01, dst0, dst1, dst01 ++ ++ bilinear_interleave src0, src1, src01, dst0, dst1, dst01 ++.endm ++ ++.macro bilinear_interleave_src_dst_8_over \ ++ numpix, src0, src1, src01, dst0, dst1, dst01 ++ ++ bilinear_interleave src0, src1, src01, dst0, dst1, dst01 ++.endm ++ ++.macro bilinear_interleave_src_dst_8_add \ ++ numpix, src0, src1, src01, dst0, dst1, dst01 ++ ++ bilinear_interleave src0, src1, src01, dst0, dst1, dst01 ++.endm ++ ++.macro bilinear_interleave_src_dst \ ++ mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01 ++ ++ bilinear_interleave_src_dst_&mask_fmt&_&op \ ++ numpix, src0, src1, src01, dst0, dst1, dst01 ++.endm ++ ++ ++/* ++ * Macros for applying masks to src pixels. (see combine_mask_u() function) ++ * src, dst should be in interleaved form. ++ * mask register should be in form (m0, m1, m2, m3). ++ */ ++.macro bilinear_apply_mask_to_src_x \ ++ numpix, src0, src1, src01, mask, \ ++ tmp01, tmp23, tmp45, tmp67 ++.endm ++ ++.macro bilinear_apply_mask_to_src_8 \ ++ numpix, src0, src1, src01, mask, \ ++ tmp01, tmp23, tmp45, tmp67 ++ ++ umull &tmp01&.8h, &src0&.8b, &mask&.8b ++ umull &tmp23&.8h, &src1&.8b, &mask&.8b ++ /* bubbles */ ++ urshr &tmp45&.8h, &tmp01&.8h, #8 ++ urshr &tmp67&.8h, &tmp23&.8h, #8 ++ /* bubbles */ ++ raddhn &src0&.8b, &tmp45&.8h, &tmp01&.8h ++ raddhn &src1&.8b, &tmp67&.8h, &tmp23&.8h ++ mov &src01&.d[0], &src0&.d[0] ++ mov &src01&.d[1], &src1&.d[0] ++.endm ++ ++.macro bilinear_apply_mask_to_src \ ++ mask_fmt, numpix, src0, src1, src01, mask, \ ++ tmp01, tmp23, tmp45, tmp67 ++ ++ bilinear_apply_mask_to_src_&mask_fmt \ ++ numpix, src0, src1, src01, mask, \ ++ tmp01, tmp23, tmp45, tmp67 ++.endm ++ ++ ++/* ++ * Macros for combining src and destination pixels. ++ * Interleave or not is depending on operator 'op'. 
++ */ ++.macro bilinear_combine_src \ ++ numpix, src0, src1, src01, dst0, dst1, dst01, \ ++ tmp01, tmp23, tmp45, tmp67, tmp8 ++.endm ++ ++.macro bilinear_combine_over \ ++ numpix, src0, src1, src01, dst0, dst1, dst01, \ ++ tmp01, tmp23, tmp45, tmp67, tmp8 ++ ++ dup &tmp8&.2s, &src1&.s[1] ++ /* bubbles */ ++ mvn &tmp8&.8b, &tmp8&.8b ++ /* bubbles */ ++ umull &tmp01&.8h, &dst0&.8b, &tmp8&.8b ++ /* bubbles */ ++ umull &tmp23&.8h, &dst1&.8b, &tmp8&.8b ++ /* bubbles */ ++ urshr &tmp45&.8h, &tmp01&.8h, #8 ++ urshr &tmp67&.8h, &tmp23&.8h, #8 ++ /* bubbles */ ++ raddhn &dst0&.8b, &tmp45&.8h, &tmp01&.8h ++ raddhn &dst1&.8b, &tmp67&.8h, &tmp23&.8h ++ mov &dst01&.d[0], &dst0&.d[0] ++ mov &dst01&.d[1], &dst1&.d[0] ++ /* bubbles */ ++ uqadd &src0&.8b, &dst0&.8b, &src0&.8b ++ uqadd &src1&.8b, &dst1&.8b, &src1&.8b ++ mov &src01&.d[0], &src0&.d[0] ++ mov &src01&.d[1], &src1&.d[0] ++.endm ++ ++.macro bilinear_combine_add \ ++ numpix, src0, src1, src01, dst0, dst1, dst01, \ ++ tmp01, tmp23, tmp45, tmp67, tmp8 ++ ++ uqadd &src0&.8b, &dst0&.8b, &src0&.8b ++ uqadd &src1&.8b, &dst1&.8b, &src1&.8b ++ mov &src01&.d[0], &src0&.d[0] ++ mov &src01&.d[1], &src1&.d[0] ++.endm ++ ++.macro bilinear_combine \ ++ op, numpix, src0, src1, src01, dst0, dst1, dst01, \ ++ tmp01, tmp23, tmp45, tmp67, tmp8 ++ ++ bilinear_combine_&op \ ++ numpix, src0, src1, src01, dst0, dst1, dst01, \ ++ tmp01, tmp23, tmp45, tmp67, tmp8 ++.endm ++ ++/* ++ * Macros for final deinterleaving of destination pixels if needed. ++ */ ++.macro bilinear_deinterleave numpix, dst0, dst1, dst01 ++ vuzp &dst0&.8b, &dst1&.8b ++ /* bubbles */ ++ vuzp &dst0&.8b, &dst1&.8b ++ mov &dst01&.d[0], &dst0&.d[0] ++ mov &dst01&.d[1], &dst1&.d[0] ++.endm ++ ++.macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01 ++.endm ++ ++.macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01 ++ bilinear_deinterleave numpix, dst0, dst1, dst01 ++.endm ++ ++.macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01 ++ bilinear_deinterleave numpix, dst0, dst1, dst01 ++.endm ++ ++.macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01 ++ bilinear_deinterleave numpix, dst0, dst1, dst01 ++.endm ++ ++.macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01 ++ bilinear_deinterleave numpix, dst0, dst1, dst01 ++.endm ++ ++.macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01 ++ bilinear_deinterleave numpix, dst0, dst1, dst01 ++.endm ++ ++.macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01 ++ bilinear_deinterleave_dst_&mask_fmt&_&op numpix, dst0, dst1, dst01 ++.endm ++ ++ ++.macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op ++ bilinear_load_&src_fmt v0, v1, v2 ++ bilinear_load_mask mask_fmt, 1, v4 ++ bilinear_load_dst dst_fmt, op, 1, v18, v19, v9 ++ umull v2.8h, v0.8b, v28.8b ++ umlal v2.8h, v1.8b, v29.8b ++ /* 5 cycles bubble */ ++ ushll v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS ++ umlsl v0.4s, v2.4h, v15.h[0] ++ umlal2 v0.4s, v2.8h, v15.h[0] ++ /* 5 cycles bubble */ ++ bilinear_duplicate_mask mask_fmt, 1, v4 ++ shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) ++ /* 3 cycles bubble */ ++ xtn v0.8b, v0.8h ++ /* 1 cycle bubble */ ++ bilinear_interleave_src_dst \ ++ mask_fmt, op, 1, v0, v1, v0, v18, v19, v9 ++ bilinear_apply_mask_to_src \ ++ mask_fmt, 1, v0, v1, v0, v4, \ ++ v3, v8, v10, v11 ++ bilinear_combine \ ++ op, 1, v0, v1, v0, v18, v19, v9, \ ++ v3, v8, v10, v11, v5 ++ bilinear_deinterleave_dst mask_fmt, op, 1, v0, v1, v0 ++ bilinear_store_&dst_fmt 1, v17, v18 ++.endm ++ ++.macro 
bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op ++ bilinear_load_and_vertical_interpolate_two_&src_fmt \ ++ v1, v11, v18, v19, v20, v21, v22, v23 ++ bilinear_load_mask mask_fmt, 2, v4 ++ bilinear_load_dst dst_fmt, op, 2, v18, v19, v9 ++ ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS ++ umlsl v0.4s, v1.4h, v15.h[0] ++ umlal2 v0.4s, v1.8h, v15.h[0] ++ ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS ++ umlsl v10.4s, v11.4h, v15.h[4] ++ umlal2 v10.4s, v11.8h, v15.h[4] ++ shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) ++ shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS) ++ bilinear_duplicate_mask mask_fmt, 2, v4 ++ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) ++ add v12.8h, v12.8h, v13.8h ++ xtn v0.8b, v0.8h ++ bilinear_interleave_src_dst \ ++ mask_fmt, op, 2, v0, v1, v0, v18, v19, v9 ++ bilinear_apply_mask_to_src \ ++ mask_fmt, 2, v0, v1, v0, v4, \ ++ v3, v8, v10, v11 ++ bilinear_combine \ ++ op, 2, v0, v1, v0, v18, v19, v9, \ ++ v3, v8, v10, v11, v5 ++ bilinear_deinterleave_dst mask_fmt, op, 2, v0, v1, v0 ++ bilinear_store_&dst_fmt 2, v16, v17 ++.endm ++ ++.macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op ++ bilinear_load_and_vertical_interpolate_four_&src_fmt \ ++ v1, v11, v4, v5, v6, v7, v22, v23 \ ++ v3, v9, v16, v17, v20, v21, v18, v19 ++ prfm PREFETCH_MODE, [TMP1, PF_OFFS] ++ sub TMP1, TMP1, STRIDE ++ prfm PREFETCH_MODE, [TMP1, PF_OFFS] ++ ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS ++ umlsl v0.4s, v1.4h, v15.h[0] ++ umlal2 v0.4s, v1.8h, v15.h[0] ++ ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS ++ umlsl v10.4s, v11.4h, v15.h[4] ++ umlal2 v10.4s, v11.8h, v15.h[4] ++ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) ++ ushll v2.4s, v3.4h, #BILINEAR_INTERPOLATION_BITS ++ umlsl v2.4s, v3.4h, v15.h[0] ++ umlal2 v2.4s, v3.8h, v15.h[0] ++ ushll v8.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS ++ umlsl v8.4s, v9.4h, v15.h[4] ++ umlal2 v8.4s, v9.8h, v15.h[4] ++ add v12.8h, v12.8h, v13.8h ++ shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) ++ shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS) ++ shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS) ++ shrn2 v2.8h, v8.4s, #(2 * BILINEAR_INTERPOLATION_BITS) ++ bilinear_load_mask mask_fmt, 4, v4 ++ bilinear_duplicate_mask mask_fmt, 4, v4 ++ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) ++ xtn v0.8b, v0.8h ++ xtn v1.8b, v2.8h ++ add v12.8h, v12.8h, v13.8h ++ bilinear_load_dst dst_fmt, op, 4, v2, v3, v21 ++ bilinear_interleave_src_dst \ ++ mask_fmt, op, 4, v0, v1, v0, v2, v3, v11 ++ bilinear_apply_mask_to_src \ ++ mask_fmt, 4, v0, v1, v0, v4, \ ++ v6, v8, v9, v10 ++ bilinear_combine \ ++ op, 4, v0, v1, v0, v2, v3, v1, \ ++ v6, v8, v9, v10, v23 ++ bilinear_deinterleave_dst mask_fmt, op, 4, v0, v1, v0 ++ bilinear_store_&dst_fmt 4, v6, v7 ++.endm ++ ++.set BILINEAR_FLAG_USE_MASK, 1 ++.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2 ++ ++/* ++ * Main template macro for generating NEON optimized bilinear scanline functions. 
++ * ++ * Bilinear scanline generator macro take folling arguments: ++ * fname - name of the function to generate ++ * src_fmt - source color format (8888 or 0565) ++ * dst_fmt - destination color format (8888 or 0565) ++ * src/dst_bpp_shift - (1 << bpp_shift) is the size of src/dst pixel in bytes ++ * process_last_pixel - code block that interpolate one pixel and does not ++ * update horizontal weight ++ * process_two_pixels - code block that interpolate two pixels and update ++ * horizontal weight ++ * process_four_pixels - code block that interpolate four pixels and update ++ * horizontal weight ++ * process_pixblock_head - head part of middle loop ++ * process_pixblock_tail - tail part of middle loop ++ * process_pixblock_tail_head - tail_head of middle loop ++ * pixblock_size - number of pixels processed in a single middle loop ++ * prefetch_distance - prefetch in the source image by that many pixels ahead ++ */ ++ ++.macro generate_bilinear_scanline_func \ ++ fname, \ ++ src_fmt, dst_fmt, src_bpp_shift, dst_bpp_shift, \ ++ bilinear_process_last_pixel, \ ++ bilinear_process_two_pixels, \ ++ bilinear_process_four_pixels, \ ++ bilinear_process_pixblock_head, \ ++ bilinear_process_pixblock_tail, \ ++ bilinear_process_pixblock_tail_head, \ ++ pixblock_size, \ ++ prefetch_distance, \ ++ flags ++ ++pixman_asm_function fname ++.if pixblock_size == 8 ++.elseif pixblock_size == 4 ++.else ++ .error unsupported pixblock size ++.endif ++ ++.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0 ++ OUT .req x0 ++ TOP .req x1 ++ BOTTOM .req x2 ++ WT .req x3 ++ WWT .req w3 ++ WB .req x4 ++ WWB .req w4 ++ X .req w5 ++ UX .req w6 ++ WIDTH .req x7 ++ TMP1 .req x10 ++ WTMP1 .req w10 ++ TMP2 .req x11 ++ WTMP2 .req w11 ++ PF_OFFS .req x12 ++ TMP3 .req x13 ++ WTMP3 .req w13 ++ TMP4 .req x14 ++ WTMP4 .req w14 ++ STRIDE .req x15 ++ DUMMY .req x30 ++ ++ stp x29, x30, [sp, -16]! ++ mov x29, sp ++ sub sp, sp, 112 ++ sub x29, x29, 64 ++ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 ++ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 ++ stp x10, x11, [x29, -80] ++ stp x12, x13, [x29, -96] ++ stp x14, x15, [x29, -112] ++.else ++ OUT .req x0 ++ MASK .req x1 ++ TOP .req x2 ++ BOTTOM .req x3 ++ WT .req x4 ++ WWT .req w4 ++ WB .req x5 ++ WWB .req w5 ++ X .req w6 ++ UX .req w7 ++ WIDTH .req x8 ++ TMP1 .req x10 ++ WTMP1 .req w10 ++ TMP2 .req x11 ++ WTMP2 .req w11 ++ PF_OFFS .req x12 ++ TMP3 .req x13 ++ WTMP3 .req w13 ++ TMP4 .req x14 ++ WTMP4 .req w14 ++ STRIDE .req x15 ++ DUMMY .req x30 ++ ++ .set prefetch_offset, prefetch_distance ++ ++ stp x29, x30, [sp, -16]! 
++ mov x29, sp ++ sub x29, x29, 64 ++ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 ++ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 ++ stp x10, x11, [x29, -80] ++ stp x12, x13, [x29, -96] ++ stp x14, x15, [x29, -112] ++ str x8, [x29, -120] ++ ldr w8, [x29, 16] ++ sub sp, sp, 120 ++.endif ++ ++ mov WTMP1, #prefetch_distance ++ umull PF_OFFS, WTMP1, UX ++ ++ sub STRIDE, BOTTOM, TOP ++ .unreq BOTTOM ++ ++ cmp WIDTH, #0 ++ ble 300f ++ ++ dup v12.8h, X ++ dup v13.8h, UX ++ dup v28.8b, WWT ++ dup v29.8b, WWB ++ mov v25.d[0], v12.d[1] ++ mov v26.d[0], v13.d[0] ++ add v25.4h, v25.4h, v26.4h ++ mov v12.d[1], v25.d[0] ++ ++ /* ensure good destination alignment */ ++ cmp WIDTH, #1 ++ blt 100f ++ tst OUT, #(1 << dst_bpp_shift) ++ beq 100f ++ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) ++ add v12.8h, v12.8h, v13.8h ++ bilinear_process_last_pixel ++ sub WIDTH, WIDTH, #1 ++100: ++ add v13.8h, v13.8h, v13.8h ++ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) ++ add v12.8h, v12.8h, v13.8h ++ ++ cmp WIDTH, #2 ++ blt 100f ++ tst OUT, #(1 << (dst_bpp_shift + 1)) ++ beq 100f ++ bilinear_process_two_pixels ++ sub WIDTH, WIDTH, #2 ++100: ++.if pixblock_size == 8 ++ cmp WIDTH, #4 ++ blt 100f ++ tst OUT, #(1 << (dst_bpp_shift + 2)) ++ beq 100f ++ bilinear_process_four_pixels ++ sub WIDTH, WIDTH, #4 ++100: ++.endif ++ subs WIDTH, WIDTH, #pixblock_size ++ blt 100f ++ asr PF_OFFS, PF_OFFS, #(16 - src_bpp_shift) ++ bilinear_process_pixblock_head ++ subs WIDTH, WIDTH, #pixblock_size ++ blt 500f ++0: ++ bilinear_process_pixblock_tail_head ++ subs WIDTH, WIDTH, #pixblock_size ++ bge 0b ++500: ++ bilinear_process_pixblock_tail ++100: ++.if pixblock_size == 8 ++ tst WIDTH, #4 ++ beq 200f ++ bilinear_process_four_pixels ++200: ++.endif ++ /* handle the remaining trailing pixels */ ++ tst WIDTH, #2 ++ beq 200f ++ bilinear_process_two_pixels ++200: ++ tst WIDTH, #1 ++ beq 300f ++ bilinear_process_last_pixel ++300: ++ ++.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0 ++ sub x29, x29, 64 ++ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 ++ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 ++ ldp x10, x11, [x29, -80] ++ ldp x12, x13, [x29, -96] ++ ldp x14, x15, [x29, -112] ++ mov sp, x29 ++ ldp x29, x30, [sp], 16 ++.else ++ sub x29, x29, 64 ++ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 ++ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 ++ ldp x10, x11, [x29, -80] ++ ldp x12, x13, [x29, -96] ++ ldp x14, x15, [x29, -112] ++ ldr x8, [x29, -120] ++ mov sp, x29 ++ ldp x29, x30, [sp], 16 ++.endif ++ ret ++ ++ .unreq OUT ++ .unreq TOP ++ .unreq WT ++ .unreq WWT ++ .unreq WB ++ .unreq WWB ++ .unreq X ++ .unreq UX ++ .unreq WIDTH ++ .unreq TMP1 ++ .unreq WTMP1 ++ .unreq TMP2 ++ .unreq PF_OFFS ++ .unreq TMP3 ++ .unreq TMP4 ++ .unreq STRIDE ++.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0 ++ .unreq MASK ++.endif ++ ++.endfunc ++ ++.endm ++ ++/* src_8888_8_8888 */ ++.macro bilinear_src_8888_8_8888_process_last_pixel ++ bilinear_interpolate_last_pixel 8888, 8, 8888, src ++.endm ++ ++.macro bilinear_src_8888_8_8888_process_two_pixels ++ bilinear_interpolate_two_pixels 8888, 8, 8888, src ++.endm ++ ++.macro bilinear_src_8888_8_8888_process_four_pixels ++ bilinear_interpolate_four_pixels 8888, 8, 8888, src ++.endm ++ ++.macro bilinear_src_8888_8_8888_process_pixblock_head ++ bilinear_src_8888_8_8888_process_four_pixels ++.endm ++ ++.macro bilinear_src_8888_8_8888_process_pixblock_tail ++.endm ++ ++.macro bilinear_src_8888_8_8888_process_pixblock_tail_head ++ bilinear_src_8888_8_8888_process_pixblock_tail ++ 
bilinear_src_8888_8_8888_process_pixblock_head ++.endm ++ ++/* src_8888_8_0565 */ ++.macro bilinear_src_8888_8_0565_process_last_pixel ++ bilinear_interpolate_last_pixel 8888, 8, 0565, src ++.endm ++ ++.macro bilinear_src_8888_8_0565_process_two_pixels ++ bilinear_interpolate_two_pixels 8888, 8, 0565, src ++.endm ++ ++.macro bilinear_src_8888_8_0565_process_four_pixels ++ bilinear_interpolate_four_pixels 8888, 8, 0565, src ++.endm ++ ++.macro bilinear_src_8888_8_0565_process_pixblock_head ++ bilinear_src_8888_8_0565_process_four_pixels ++.endm ++ ++.macro bilinear_src_8888_8_0565_process_pixblock_tail ++.endm ++ ++.macro bilinear_src_8888_8_0565_process_pixblock_tail_head ++ bilinear_src_8888_8_0565_process_pixblock_tail ++ bilinear_src_8888_8_0565_process_pixblock_head ++.endm ++ ++/* src_0565_8_x888 */ ++.macro bilinear_src_0565_8_x888_process_last_pixel ++ bilinear_interpolate_last_pixel 0565, 8, 8888, src ++.endm ++ ++.macro bilinear_src_0565_8_x888_process_two_pixels ++ bilinear_interpolate_two_pixels 0565, 8, 8888, src ++.endm ++ ++.macro bilinear_src_0565_8_x888_process_four_pixels ++ bilinear_interpolate_four_pixels 0565, 8, 8888, src ++.endm ++ ++.macro bilinear_src_0565_8_x888_process_pixblock_head ++ bilinear_src_0565_8_x888_process_four_pixels ++.endm ++ ++.macro bilinear_src_0565_8_x888_process_pixblock_tail ++.endm ++ ++.macro bilinear_src_0565_8_x888_process_pixblock_tail_head ++ bilinear_src_0565_8_x888_process_pixblock_tail ++ bilinear_src_0565_8_x888_process_pixblock_head ++.endm ++ ++/* src_0565_8_0565 */ ++.macro bilinear_src_0565_8_0565_process_last_pixel ++ bilinear_interpolate_last_pixel 0565, 8, 0565, src ++.endm ++ ++.macro bilinear_src_0565_8_0565_process_two_pixels ++ bilinear_interpolate_two_pixels 0565, 8, 0565, src ++.endm ++ ++.macro bilinear_src_0565_8_0565_process_four_pixels ++ bilinear_interpolate_four_pixels 0565, 8, 0565, src ++.endm ++ ++.macro bilinear_src_0565_8_0565_process_pixblock_head ++ bilinear_src_0565_8_0565_process_four_pixels ++.endm ++ ++.macro bilinear_src_0565_8_0565_process_pixblock_tail ++.endm ++ ++.macro bilinear_src_0565_8_0565_process_pixblock_tail_head ++ bilinear_src_0565_8_0565_process_pixblock_tail ++ bilinear_src_0565_8_0565_process_pixblock_head ++.endm ++ ++/* over_8888_8888 */ ++.macro bilinear_over_8888_8888_process_last_pixel ++ bilinear_interpolate_last_pixel 8888, x, 8888, over ++.endm ++ ++.macro bilinear_over_8888_8888_process_two_pixels ++ bilinear_interpolate_two_pixels 8888, x, 8888, over ++.endm ++ ++.macro bilinear_over_8888_8888_process_four_pixels ++ bilinear_interpolate_four_pixels 8888, x, 8888, over ++.endm ++ ++.macro bilinear_over_8888_8888_process_pixblock_head ++ asr WTMP1, X, #16 ++ add X, X, UX ++ add TMP1, TOP, TMP1, lsl #2 ++ asr WTMP2, X, #16 ++ add X, X, UX ++ add TMP2, TOP, TMP2, lsl #2 ++ ++ ld1 {v22.2s}, [TMP1], STRIDE ++ ld1 {v23.2s}, [TMP1] ++ asr WTMP3, X, #16 ++ add X, X, UX ++ add TMP3, TOP, TMP3, lsl #2 ++ umull v8.8h, v22.8b, v28.8b ++ umlal v8.8h, v23.8b, v29.8b ++ ++ ld1 {v22.2s}, [TMP2], STRIDE ++ ld1 {v23.2s}, [TMP2] ++ asr WTMP4, X, #16 ++ add X, X, UX ++ add TMP4, TOP, TMP4, lsl #2 ++ umull v9.8h, v22.8b, v28.8b ++ umlal v9.8h, v23.8b, v29.8b ++ ++ ld1 {v22.2s}, [TMP3], STRIDE ++ ld1 {v23.2s}, [TMP3] ++ umull v10.8h, v22.8b, v28.8b ++ umlal v10.8h, v23.8b, v29.8b ++ ++ ushll v0.4s, v8.4h, #BILINEAR_INTERPOLATION_BITS ++ umlsl v0.4s, v8.4h, v15.h[0] ++ umlal2 v0.4s, v8.8h, v15.h[0] ++ ++ prfm PREFETCH_MODE, [TMP4, PF_OFFS] ++ ld1 {v16.2s}, [TMP4], STRIDE ++ ld1 {v17.2s}, [TMP4] ++ 
prfm PREFETCH_MODE, [TMP4, PF_OFFS] ++ umull v11.8h, v16.8b, v28.8b ++ umlal v11.8h, v17.8b, v29.8b ++ ++ ushll v1.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS ++ umlsl v1.4s, v9.4h, v15.h[4] ++ umlal2 v1.4s, v9.8h, v15.h[4] ++ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) ++ add v12.8h, v12.8h, v13.8h ++.endm ++ ++.macro bilinear_over_8888_8888_process_pixblock_tail ++ ushll v2.4s, v10.4h, #BILINEAR_INTERPOLATION_BITS ++ umlsl v2.4s, v10.4h, v15.h[0] ++ umlal2 v2.4s, v10.8h, v15.h[0] ++ ushll v3.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS ++ umlsl v3.4s, v11.4h, v15.h[4] ++ umlal2 v3.4s, v11.8h, v15.h[4] ++ shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) ++ shrn2 v0.8h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS) ++ shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS) ++ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) ++ shrn2 v2.8h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS) ++ xtn v6.8b, v0.8h ++ xtn v7.8b, v2.8h ++ ld1 {v2.2s, v3.2s}, [OUT] ++ prfm PREFETCH_MODE, [OUT, #(prefetch_offset * 4)] ++ vuzp v6.8b, v7.8b ++ vuzp v2.8b, v3.8b ++ vuzp v6.8b, v7.8b ++ vuzp v2.8b, v3.8b ++ dup v4.2s, v7.s[1] ++ mvn v4.8b, v4.8b ++ umull v11.8h, v2.8b, v4.8b ++ umull v2.8h, v3.8b, v4.8b ++ urshr v1.8h, v11.8h, #8 ++ urshr v10.8h, v2.8h, #8 ++ raddhn v3.8b, v10.8h, v2.8h ++ raddhn v2.8b, v1.8h, v11.8h ++ uqadd v6.8b, v2.8b, v6.8b ++ uqadd v7.8b, v3.8b, v7.8b ++ vuzp v6.8b, v7.8b ++ vuzp v6.8b, v7.8b ++ add v12.8h, v12.8h, v13.8h ++ st1 {v6.2s, v7.2s}, [OUT], #16 ++.endm ++ ++.macro bilinear_over_8888_8888_process_pixblock_tail_head ++ ushll v2.4s, v10.4h, #BILINEAR_INTERPOLATION_BITS ++ asr WTMP1, X, #16 ++ add X, X, UX ++ add TMP1, TOP, TMP1, lsl #2 ++ umlsl v2.4s, v10.4h, v15.h[0] ++ asr WTMP2, X, #16 ++ add X, X, UX ++ add TMP2, TOP, TMP2, lsl #2 ++ umlal2 v2.4s, v10.8h, v15.h[0] ++ ushll v3.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS ++ ld1 {v20.2s}, [TMP1], STRIDE ++ umlsl v3.4s, v11.4h, v15.h[4] ++ umlal2 v3.4s, v11.8h, v15.h[4] ++ ld1 {v21.2s}, [TMP1] ++ umull v8.8h, v20.8b, v28.8b ++ umlal v8.8h, v21.8b, v29.8b ++ shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS) ++ shrn2 v0.8h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS) ++ shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS) ++ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) ++ ld1 {v22.2s}, [TMP2], STRIDE ++ shrn2 v2.8h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS) ++ xtn v6.8b, v0.8h ++ ld1 {v23.2s}, [TMP2] ++ umull v9.8h, v22.8b, v28.8b ++ asr WTMP3, X, #16 ++ add X, X, UX ++ add TMP3, TOP, TMP3, lsl #2 ++ asr WTMP4, X, #16 ++ add X, X, UX ++ add TMP4, TOP, TMP4, lsl #2 ++ umlal v9.8h, v23.8b, v29.8b ++ xtn v7.8b, v2.8h ++ ld1 {v2.2s, v3.2s}, [OUT] ++ prfm PREFETCH_MODE, [OUT, PF_OFFS] ++ ld1 {v22.2s}, [TMP3], STRIDE ++ vuzp v6.8b, v7.8b ++ vuzp v2.8b, v3.8b ++ vuzp v6.8b, v7.8b ++ vuzp v2.8b, v3.8b ++ dup v4.2s, v7.s[1] ++ ld1 {v23.2s}, [TMP3] ++ mvn v4.8b, v4.8b ++ umull v10.8h, v22.8b, v28.8b ++ umlal v10.8h, v23.8b, v29.8b ++ umull v11.8h, v2.8b, v4.8b ++ umull v2.8h, v3.8b, v4.8b ++ ushll v0.4s, v8.4h, #BILINEAR_INTERPOLATION_BITS ++ umlsl v0.4s, v8.4h, v15.h[0] ++ urshr v1.8h, v11.8h, #8 ++ umlal2 v0.4s, v8.8h, v15.h[0] ++ urshr v8.8h, v2.8h, #8 ++ raddhn v3.8b, v8.8h, v2.8h ++ raddhn v2.8b, v1.8h, v11.8h ++ prfm PREFETCH_MODE, [TMP4, PF_OFFS] ++ ld1 {v16.2s}, [TMP4], STRIDE ++ uqadd v6.8b, v2.8b, v6.8b ++ uqadd v7.8b, v3.8b, v7.8b ++ ld1 {v17.2s}, [TMP4] ++ prfm PREFETCH_MODE, [TMP4, PF_OFFS] ++ umull v11.8h, v16.8b, v28.8b ++ umlal v11.8h, v17.8b, v29.8b ++ vuzp v6.8b, v7.8b ++ ushll v1.4s, v9.4h, 
#BILINEAR_INTERPOLATION_BITS ++ vuzp v6.8b, v7.8b ++ umlsl v1.4s, v9.4h, v15.h[4] ++ add v12.8h, v12.8h, v13.8h ++ umlal2 v1.4s, v9.8h, v15.h[4] ++ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) ++ add v12.8h, v12.8h, v13.8h ++ st1 {v6.2s, v7.2s}, [OUT], #16 ++.endm ++ ++/* over_8888_8_8888 */ ++.macro bilinear_over_8888_8_8888_process_last_pixel ++ bilinear_interpolate_last_pixel 8888, 8, 8888, over ++.endm ++ ++.macro bilinear_over_8888_8_8888_process_two_pixels ++ bilinear_interpolate_two_pixels 8888, 8, 8888, over ++.endm ++ ++.macro bilinear_over_8888_8_8888_process_four_pixels ++ bilinear_interpolate_two_pixels 8888, 8, 8888, over ++ bilinear_interpolate_two_pixels 8888, 8, 8888, over ++.endm ++ ++.macro bilinear_over_8888_8_8888_process_pixblock_head ++ bilinear_over_8888_8_8888_process_four_pixels ++.endm ++ ++.macro bilinear_over_8888_8_8888_process_pixblock_tail ++.endm ++ ++.macro bilinear_over_8888_8_8888_process_pixblock_tail_head ++ bilinear_over_8888_8_8888_process_pixblock_tail ++ bilinear_over_8888_8_8888_process_pixblock_head ++.endm ++ ++/* add_8888_8888 */ ++.macro bilinear_add_8888_8888_process_last_pixel ++ bilinear_interpolate_last_pixel 8888, x, 8888, add ++.endm ++ ++.macro bilinear_add_8888_8888_process_two_pixels ++ bilinear_interpolate_two_pixels 8888, x, 8888, add ++.endm ++ ++.macro bilinear_add_8888_8888_process_four_pixels ++ bilinear_interpolate_two_pixels 8888, x, 8888, add ++ bilinear_interpolate_two_pixels 8888, x, 8888, add ++.endm ++ ++.macro bilinear_add_8888_8888_process_pixblock_head ++ bilinear_add_8888_8888_process_four_pixels ++.endm ++ ++.macro bilinear_add_8888_8888_process_pixblock_tail ++.endm ++ ++.macro bilinear_add_8888_8888_process_pixblock_tail_head ++ bilinear_add_8888_8888_process_pixblock_tail ++ bilinear_add_8888_8888_process_pixblock_head ++.endm ++ ++/* add_8888_8_8888 */ ++.macro bilinear_add_8888_8_8888_process_last_pixel ++ bilinear_interpolate_last_pixel 8888, 8, 8888, add ++.endm ++ ++.macro bilinear_add_8888_8_8888_process_two_pixels ++ bilinear_interpolate_two_pixels 8888, 8, 8888, add ++.endm ++ ++.macro bilinear_add_8888_8_8888_process_four_pixels ++ bilinear_interpolate_four_pixels 8888, 8, 8888, add ++.endm ++ ++.macro bilinear_add_8888_8_8888_process_pixblock_head ++ bilinear_add_8888_8_8888_process_four_pixels ++.endm ++ ++.macro bilinear_add_8888_8_8888_process_pixblock_tail ++.endm ++ ++.macro bilinear_add_8888_8_8888_process_pixblock_tail_head ++ bilinear_add_8888_8_8888_process_pixblock_tail ++ bilinear_add_8888_8_8888_process_pixblock_head ++.endm ++ ++ ++/* Bilinear scanline functions */ ++generate_bilinear_scanline_func \ ++ pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_neon, \ ++ 8888, 8888, 2, 2, \ ++ bilinear_src_8888_8_8888_process_last_pixel, \ ++ bilinear_src_8888_8_8888_process_two_pixels, \ ++ bilinear_src_8888_8_8888_process_four_pixels, \ ++ bilinear_src_8888_8_8888_process_pixblock_head, \ ++ bilinear_src_8888_8_8888_process_pixblock_tail, \ ++ bilinear_src_8888_8_8888_process_pixblock_tail_head, \ ++ 4, 28, BILINEAR_FLAG_USE_MASK ++ ++generate_bilinear_scanline_func \ ++ pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_neon, \ ++ 8888, 0565, 2, 1, \ ++ bilinear_src_8888_8_0565_process_last_pixel, \ ++ bilinear_src_8888_8_0565_process_two_pixels, \ ++ bilinear_src_8888_8_0565_process_four_pixels, \ ++ bilinear_src_8888_8_0565_process_pixblock_head, \ ++ bilinear_src_8888_8_0565_process_pixblock_tail, \ ++ bilinear_src_8888_8_0565_process_pixblock_tail_head, \ ++ 4, 28, 
BILINEAR_FLAG_USE_MASK ++ ++generate_bilinear_scanline_func \ ++ pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_neon, \ ++ 0565, 8888, 1, 2, \ ++ bilinear_src_0565_8_x888_process_last_pixel, \ ++ bilinear_src_0565_8_x888_process_two_pixels, \ ++ bilinear_src_0565_8_x888_process_four_pixels, \ ++ bilinear_src_0565_8_x888_process_pixblock_head, \ ++ bilinear_src_0565_8_x888_process_pixblock_tail, \ ++ bilinear_src_0565_8_x888_process_pixblock_tail_head, \ ++ 4, 28, BILINEAR_FLAG_USE_MASK ++ ++generate_bilinear_scanline_func \ ++ pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_neon, \ ++ 0565, 0565, 1, 1, \ ++ bilinear_src_0565_8_0565_process_last_pixel, \ ++ bilinear_src_0565_8_0565_process_two_pixels, \ ++ bilinear_src_0565_8_0565_process_four_pixels, \ ++ bilinear_src_0565_8_0565_process_pixblock_head, \ ++ bilinear_src_0565_8_0565_process_pixblock_tail, \ ++ bilinear_src_0565_8_0565_process_pixblock_tail_head, \ ++ 4, 28, BILINEAR_FLAG_USE_MASK ++ ++generate_bilinear_scanline_func \ ++ pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_neon, \ ++ 8888, 8888, 2, 2, \ ++ bilinear_over_8888_8888_process_last_pixel, \ ++ bilinear_over_8888_8888_process_two_pixels, \ ++ bilinear_over_8888_8888_process_four_pixels, \ ++ bilinear_over_8888_8888_process_pixblock_head, \ ++ bilinear_over_8888_8888_process_pixblock_tail, \ ++ bilinear_over_8888_8888_process_pixblock_tail_head, \ ++ 4, 28, 0 ++ ++generate_bilinear_scanline_func \ ++ pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_neon, \ ++ 8888, 8888, 2, 2, \ ++ bilinear_over_8888_8_8888_process_last_pixel, \ ++ bilinear_over_8888_8_8888_process_two_pixels, \ ++ bilinear_over_8888_8_8888_process_four_pixels, \ ++ bilinear_over_8888_8_8888_process_pixblock_head, \ ++ bilinear_over_8888_8_8888_process_pixblock_tail, \ ++ bilinear_over_8888_8_8888_process_pixblock_tail_head, \ ++ 4, 28, BILINEAR_FLAG_USE_MASK ++ ++generate_bilinear_scanline_func \ ++ pixman_scaled_bilinear_scanline_8888_8888_ADD_asm_neon, \ ++ 8888, 8888, 2, 2, \ ++ bilinear_add_8888_8888_process_last_pixel, \ ++ bilinear_add_8888_8888_process_two_pixels, \ ++ bilinear_add_8888_8888_process_four_pixels, \ ++ bilinear_add_8888_8888_process_pixblock_head, \ ++ bilinear_add_8888_8888_process_pixblock_tail, \ ++ bilinear_add_8888_8888_process_pixblock_tail_head, \ ++ 4, 28, 0 ++ ++generate_bilinear_scanline_func \ ++ pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_neon, \ ++ 8888, 8888, 2, 2, \ ++ bilinear_add_8888_8_8888_process_last_pixel, \ ++ bilinear_add_8888_8_8888_process_two_pixels, \ ++ bilinear_add_8888_8_8888_process_four_pixels, \ ++ bilinear_add_8888_8_8888_process_pixblock_head, \ ++ bilinear_add_8888_8_8888_process_pixblock_tail, \ ++ bilinear_add_8888_8_8888_process_pixblock_tail_head, \ ++ 4, 28, BILINEAR_FLAG_USE_MASK +diff --git a/pixman/pixman-arma64-neon-asm.S b/pixman/pixman-arma64-neon-asm.S +new file mode 100644 +index 0000000..18ace0e +--- /dev/null ++++ b/pixman/pixman-arma64-neon-asm.S +@@ -0,0 +1,3704 @@ ++/* ++ * Copyright © 2009 Nokia Corporation ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice 
and this permission notice (including the next
++ * paragraph) shall be included in all copies or substantial portions of the
++ * Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
++ * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
++ */
++
++/*
++ * This file contains implementations of NEON optimized pixel processing
++ * functions. There is no full and detailed tutorial, but some functions
++ * (those which are exposing some new or interesting features) are
++ * extensively commented and can be used as examples.
++ *
++ * You may want to have a look at the comments for the following functions:
++ * - pixman_composite_over_8888_0565_asm_neon
++ * - pixman_composite_over_n_8_0565_asm_neon
++ */
++
++/* Prevent the stack from becoming executable for no reason... */
++#if defined(__linux__) && defined(__ELF__)
++.section .note.GNU-stack,"",%progbits
++#endif
++
++.text
++.arch armv8-a
++
++.altmacro
++.p2align 2
++
++#include "pixman-private.h"
++#include "pixman-arm-asm.h"
++#include "pixman-arma64-neon-asm.h"
++
++/* Global configuration options and preferences */
++
++/*
++ * The code can optionally make use of unaligned memory accesses to improve
++ * performance of handling leading/trailing pixels for each scanline.
++ * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
++ * example in linux if unaligned memory accesses are not configured to
++ * generate exceptions.
++ */
++.set RESPECT_STRICT_ALIGNMENT, 1
++
++/*
++ * Set default prefetch type. There is a choice between the following options:
++ *
++ * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
++ * as NOP to workaround some HW bugs or for whatever other reason)
++ *
++ * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
++ * advanced prefetch introduces heavy overhead)
++ *
++ * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
++ * which can run ARM and NEON instructions simultaneously so that extra ARM
++ * instructions do not add (many) extra cycles, but improve prefetch efficiency)
++ *
++ * Note: some types of function can't support advanced prefetch and fall back
++ * to the simple one (those which handle 24bpp pixels)
++ */
++.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
++
++/* Prefetch distance in pixels for simple prefetch */
++.set PREFETCH_DISTANCE_SIMPLE, 64
++
++/*
++ * Implementation of pixman_composite_over_8888_0565_asm_neon
++ *
++ * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
++ * performs OVER compositing operation. Function fast_composite_over_8888_0565
++ * from pixman-fast-path.c does the same in C and can be used as a reference.
++ *
++ * First we need to have some NEON assembly code which can do the actual
++ * operation on the pixels and provide it to the template macro.
++ *
++ * The template macro quite conveniently takes care of emitting all the
++ * necessary code for memory reading and writing (including the quite tricky
++ * cases of handling unaligned leading/trailing pixels), so we only need to
++ * deal with the data in NEON registers.
++ *
++ * The recommended general NEON register allocation is the following:
++ * v0, v1, v2, v3 - contain loaded source pixel data
++ * v4, v5, v6, v7 - contain loaded destination pixels (if they are needed)
++ * v24, v25, v26, v27 - contain loaded mask pixel data (if a mask is used)
++ * v28, v29, v30, v31 - place for storing the result (destination pixels)
++ *
++ * As can be seen above, four 64-bit NEON registers are used for keeping
++ * intermediate pixel data, and up to 8 pixels can be processed in one step
++ * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
++ *
++ * This particular function uses the following register allocation:
++ * v0, v1, v2, v3 - contain loaded source pixel data
++ * v4, v5 - contain loaded destination pixels (they are needed)
++ * v28, v29 - place for storing the result (destination pixels)
++ */
++
++/*
++ * Step one. We need to have some code to do arithmetic on pixel data.
++ * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
++ * back-to-back, they take pixel data from {v0, v1, v2, v3} and {v4, v5},
++ * perform all the needed calculations and write the result to {v28, v29}.
++ * The rationale for having two macros and not just one will be explained
++ * later. In practice, any single monolithic function which does the work can
++ * be split into two parts in any arbitrary way without affecting correctness.
++ *
++ * There is one special trick here too. The common template macro can
++ * optionally make our life a bit easier by doing R, G, B, A color component
++ * deinterleaving for 32bpp pixel formats (and this feature is used in the
++ * 'pixman_composite_over_8888_0565_asm_neon' function). So instead of having
++ * 8 packed pixels in the {v0, v1, v2, v3} registers, we actually use the v0
++ * register for the blue channel (a vector of eight 8-bit values), the v1
++ * register for green, v2 for red and v3 for alpha. This simple conversion
++ * can also be done with a few NEON instructions:
++ *
++ * Packed to planar conversion: // vuzp8 is a wrapper macro
++ *   vuzp8 v0, v1
++ *   vuzp8 v2, v3
++ *   vuzp8 v1, v3
++ *   vuzp8 v0, v2
++ *
++ * Planar to packed conversion: // vzip8 is a wrapper macro
++ *   vzip8 v0, v2
++ *   vzip8 v1, v3
++ *   vzip8 v2, v3
++ *   vzip8 v0, v1
++ *
++ * But pixels can also be loaded directly in planar format using the LD4 / .8b
++ * NEON instruction. It is 1 cycle slower than LD1 / .2s, so this is not
++ * always desirable; that's why deinterleaving is optional.
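++ *
++ * To make the planar layout concrete: for eight a8r8g8b8 pixels starting at
++ * a byte pointer p (assuming little-endian byte order, so the bytes are
++ * B, G, R, A in memory), the deinterleaving load behaves like this C sketch
++ * (illustrative only):
++ *
++ *   uint8_t v0[8], v1[8], v2[8], v3[8];
++ *   for (int i = 0; i < 8; i++) {   // what ld4 {v0.8b, v1.8b, v2.8b, v3.8b} does
++ *       v0[i] = p[4 * i + 0];       // blue byte of pixel i
++ *       v1[i] = p[4 * i + 1];       // green
++ *       v2[i] = p[4 * i + 2];       // red
++ *       v3[i] = p[4 * i + 3];       // alpha
++ *   }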
++ *
++ * But anyway, here is the code:
++ */
++
++.macro pixman_composite_over_8888_0565_process_pixblock_head
++    /* convert 8 r5g6b5 pixel data from {v4} to planar 8-bit format
++       and put data into v6 - red, v7 - green, v30 - blue */
++    mov v4.d[1], v5.d[0]
++    shrn v6.8b, v4.8h, #8
++    shrn v7.8b, v4.8h, #3
++    sli v4.8h, v4.8h, #5
++    sri v6.8b, v6.8b, #5
++    mvn v3.8b, v3.8b /* invert source alpha */
++    sri v7.8b, v7.8b, #6
++    shrn v30.8b, v4.8h, #2
++    /* now do alpha blending, storing results in 8-bit planar format
++       into v20 - red, v23 - green, v22 - blue */
++    umull v10.8h, v3.8b, v6.8b
++    umull v11.8h, v3.8b, v7.8b
++    umull v12.8h, v3.8b, v30.8b
++    urshr v17.8h, v10.8h, #8
++    urshr v18.8h, v11.8h, #8
++    urshr v19.8h, v12.8h, #8
++    raddhn v20.8b, v10.8h, v17.8h
++    raddhn v23.8b, v11.8h, v18.8h
++    raddhn v22.8b, v12.8h, v19.8h
++.endm
++
++.macro pixman_composite_over_8888_0565_process_pixblock_tail
++    /* ... continue alpha blending */
++    uqadd v17.8b, v2.8b, v20.8b
++    uqadd v18.8b, v0.8b, v22.8b
++    uqadd v19.8b, v1.8b, v23.8b
++    /* convert the result to r5g6b5 and store it into {v14} */
++    ushll v14.8h, v17.8b, #7
++    sli v14.8h, v14.8h, #1
++    ushll v8.8h, v19.8b, #7
++    sli v8.8h, v8.8h, #1
++    ushll v9.8h, v18.8b, #7
++    sli v9.8h, v9.8h, #1
++    sri v14.8h, v8.8h, #5
++    sri v14.8h, v9.8h, #11
++    mov v28.d[0], v14.d[0]
++    mov v29.d[0], v14.d[1]
++.endm
++
++/*
++ * OK, now we have almost everything that we need. Using the above two
++ * macros, the work can be done correctly. But now we want to optimize
++ * it a bit. ARM Cortex-A8 is an in-order core, and it benefits a lot
++ * from good code scheduling and software pipelining.
++ *
++ * Let's construct some code which will run in the core main loop.
++ * Some pseudo-code of the main loop will look like this:
++ *   head
++ *   while (...) {
++ *     tail
++ *     head
++ *   }
++ *   tail
++ *
++ * It may look a bit weird, but this setup allows hiding instruction
++ * latencies better and also utilizes the dual-issue capability more
++ * efficiently (making pairs of load-store and ALU instructions).
++ *
++ * So what we need now is a '*_tail_head' macro, which will be used
++ * in the core main loop. A trivial straightforward implementation
++ * of this macro would look like this:
++ *
++ *   pixman_composite_over_8888_0565_process_pixblock_tail
++ *   st1 {v28.4h, v29.4h}, [DST_W], #16
++ *   ld1 {v4.4h, v5.4h}, [DST_R], #16
++ *   ld4 {v0.2s, v1.2s, v2.2s, v3.2s}, [SRC], #32
++ *   pixman_composite_over_8888_0565_process_pixblock_head
++ *   cache_preload 8, 8
++ *
++ * Now it also has some LD/ST instructions. We simply can't move from
++ * processing one block of pixels to the next one with just arithmetic.
++ * The previously processed data needs to be written to memory and new
++ * data needs to be fetched. Fortunately, this main loop does not deal
++ * with partial leading/trailing pixels and can load/store full blocks
++ * of pixels in bulk. Additionally, the destination buffer is already
++ * 16-byte aligned here (which is good for performance).
++ *
++ * New things here are the DST_R, DST_W, SRC and MASK identifiers. These
++ * are aliases for the ARM registers which are used as pointers for
++ * accessing data. We maintain separate pointers for reading and writing
++ * the destination buffer (DST_R and DST_W).
++ *
++ * Another new thing is the 'cache_preload' macro. It is used to prefetch
++ * data into the CPU L2 cache, and it improves performance when dealing
++ * with images far larger than the cache. It takes one argument (actually
++ * two, but they need to be the same here) - the number of pixels in a
++ * block. Looking into 'pixman-arma64-neon-asm.h' can provide some details
++ * about this macro. Moreover, if good performance is needed, the code
++ * from this macro has to be copied into the '*_tail_head' macro and mixed
++ * with the rest of the code for optimal instruction scheduling. We are
++ * actually doing that below.
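++ *
++ * The PF-prefixed lines interleaved through the optimized code below are
++ * the expanded 'cache_preload' bookkeeping. A rough C model of what they
++ * do per block is the following (a sketch only: addresses are shown in
++ * pixel units, and the PF_CTL logic that limits how many scanlines ahead
++ * may be prefetched is simplified away):
++ *
++ *   pf_x += 8;                          // advance by one pixel block
++ *   if (pf_x >= orig_w) {               // ran past the end of the scanline?
++ *       pf_x -= orig_w;
++ *       pf_src += src_stride;           // move on to the next scanline
++ *       pf_dst += dst_stride;
++ *   }
++ *   __builtin_prefetch(pf_src + pf_x);  // the PF prfm instructions
++ *   __builtin_prefetch(pf_dst + pf_x);
++ *
++ * The odd-looking 'PF ldrsb' instructions simply touch one byte of the
++ * next scanline ahead of time.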
++ *
++ * Now after all the explanations, here is the optimized code.
++ * Different instruction streams (originating from the '*_head', '*_tail'
++ * and 'cache_preload' macros) use different indentation levels for
++ * better readability. Actually, taking the code from one of these
++ * indentation levels and ignoring a few LD/ST instructions would
++ * result in exactly the code from the '*_head', '*_tail' or
++ * 'cache_preload' macro!
++ */
++
++#if 1
++
++.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
++        uqadd v17.8b, v2.8b, v20.8b
++    ld1 {v4.4h, v5.4h}, [DST_R], #16
++    mov v4.d[1], v5.d[0]
++        uqadd v18.8b, v0.8b, v22.8b
++        uqadd v19.8b, v1.8b, v23.8b
++    shrn v6.8b, v4.8h, #8
++    fetch_src_pixblock
++    shrn v7.8b, v4.8h, #3
++    sli v4.8h, v4.8h, #5
++        ushll v14.8h, v17.8b, #7
++        sli v14.8h, v14.8h, #1
++                          PF add PF_X, PF_X, #8
++        ushll v8.8h, v19.8b, #7
++        sli v8.8h, v8.8h, #1
++                          PF tst PF_CTL, #0xF
++    sri v6.8b, v6.8b, #5
++                          PF beq 10f
++                          PF add PF_X, PF_X, #8
++10:
++    mvn v3.8b, v3.8b
++                          PF beq 10f
++                          PF sub PF_CTL, PF_CTL, #1
++10:
++    sri v7.8b, v7.8b, #6
++    shrn v30.8b, v4.8h, #2
++    umull v10.8h, v3.8b, v6.8b
++                          PF lsl DUMMY, PF_X, #src_bpp_shift
++                          PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
++    umull v11.8h, v3.8b, v7.8b
++    umull v12.8h, v3.8b, v30.8b
++                          PF lsl DUMMY, PF_X, #dst_bpp_shift
++                          PF prfm PREFETCH_MODE, [PF_DST, DUMMY]
++        sri v14.8h, v8.8h, #5
++                          PF cmp PF_X, ORIG_W
++        ushll v9.8h, v18.8b, #7
++        sli v9.8h, v9.8h, #1
++    urshr v17.8h, v10.8h, #8
++                          PF ble 10f
++                          PF sub PF_X, PF_X, ORIG_W
++10:
++    urshr v19.8h, v11.8h, #8
++    urshr v18.8h, v12.8h, #8
++                          PF ble 10f
++                          PF subs PF_CTL, PF_CTL, #0x10
++10:
++        sri v14.8h, v9.8h, #11
++        mov v28.d[0], v14.d[0]
++        mov v29.d[0], v14.d[1]
++                          PF ble 10f
++                          PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
++                          PF ldrsb DUMMY, [PF_SRC, DUMMY]
++                          PF add PF_SRC, PF_SRC, #1
++10:
++    raddhn v20.8b, v10.8h, v17.8h
++    raddhn v23.8b, v11.8h, v19.8h
++                          PF ble 10f
++                          PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
++                          PF ldrsb DUMMY, [PF_DST, DUMMY]
++                          PF add PF_DST, PF_DST, #1
++10:
++    raddhn v22.8b, v12.8h, v18.8h
++    st1 {v14.8h}, [DST_W], #16
++.endm
++
++#else
++
++/* If we did not care much about performance, we would just use this... */
++.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
++    pixman_composite_over_8888_0565_process_pixblock_tail
++    st1 {v14.8h}, [DST_W], #16
++    ld1 {v4.4h, v5.4h}, [DST_R], #16
++    fetch_src_pixblock
++    pixman_composite_over_8888_0565_process_pixblock_head
++    cache_preload 8, 8
++.endm
++
++#endif
++
++/*
++ * And now the final part. We are using the 'generate_composite_function'
++ * macro to put all the stuff together. We specify the name of the function
++ * which we want to get, and the number of bits per pixel for the source,
++ * mask and destination (0 if unused, like the mask in this case).
Next come some bit ++ * flags: ++ * FLAG_DST_READWRITE - tells that the destination buffer is both read ++ * and written, for write-only buffer we would use ++ * FLAG_DST_WRITEONLY flag instead ++ * FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data ++ * and separate color channels for 32bpp format. ++ * The next things are: ++ * - the number of pixels processed per iteration (8 in this case, because ++ * that's the maximum what can fit into four 64-bit NEON registers). ++ * - prefetch distance, measured in pixel blocks. In this case it is 5 times ++ * by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal ++ * prefetch distance can be selected by running some benchmarks. ++ * ++ * After that we specify some macros, these are 'default_init', ++ * 'default_cleanup' here which are empty (but it is possible to have custom ++ * init/cleanup macros to be able to save/restore some extra NEON registers ++ * like d8-d15 or do anything else) followed by ++ * 'pixman_composite_over_8888_0565_process_pixblock_head', ++ * 'pixman_composite_over_8888_0565_process_pixblock_tail' and ++ * 'pixman_composite_over_8888_0565_process_pixblock_tail_head' ++ * which we got implemented above. ++ * ++ * The last part is the NEON registers allocation scheme. ++ */ ++generate_composite_function \ ++ pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \ ++ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ ++ 8, /* number of pixels, processed in a single block */ \ ++ 5, /* prefetch distance */ \ ++ default_init, \ ++ default_cleanup, \ ++ pixman_composite_over_8888_0565_process_pixblock_head, \ ++ pixman_composite_over_8888_0565_process_pixblock_tail, \ ++ pixman_composite_over_8888_0565_process_pixblock_tail_head, \ ++ 28, /* dst_w_basereg */ \ ++ 4, /* dst_r_basereg */ \ ++ 0, /* src_basereg */ \ ++ 24 /* mask_basereg */ ++ ++/******************************************************************************/ ++ ++.macro pixman_composite_over_n_0565_process_pixblock_head ++ /* convert 8 r5g6b5 pixel data from {v4} to planar 8-bit format ++ and put data into v6 - red, v7 - green, v30 - blue */ ++ mov v4.d[1], v5.d[0] ++ shrn v6.8b, v4.8h, #8 ++ shrn v7.8b, v4.8h, #3 ++ sli v4.8h, v4.8h, #5 ++ sri v6.8b, v6.8b, #5 ++ sri v7.8b, v7.8b, #6 ++ shrn v30.8b, v4.8h, #2 ++ /* now do alpha blending, storing results in 8-bit planar format ++ into v20 - red, v23 - green, v22 - blue */ ++ umull v10.8h, v3.8b, v6.8b ++ umull v11.8h, v3.8b, v7.8b ++ umull v12.8h, v3.8b, v30.8b ++ urshr v13.8h, v10.8h, #8 ++ urshr v14.8h, v11.8h, #8 ++ urshr v15.8h, v12.8h, #8 ++ raddhn v20.8b, v10.8h, v13.8h ++ raddhn v23.8b, v11.8h, v14.8h ++ raddhn v22.8b, v12.8h, v15.8h ++.endm ++ ++.macro pixman_composite_over_n_0565_process_pixblock_tail ++ /* ... 
continue alpha blending */ ++ uqadd v17.8b, v2.8b, v20.8b ++ uqadd v18.8b, v0.8b, v22.8b ++ uqadd v19.8b, v1.8b, v23.8b ++ /* convert the result to r5g6b5 and store it into {v14} */ ++ ushll v14.8h, v17.8b, #7 ++ sli v14.8h, v14.8h, #1 ++ ushll v8.8h, v19.8b, #7 ++ sli v8.8h, v8.8h, #1 ++ ushll v9.8h, v18.8b, #7 ++ sli v9.8h, v9.8h, #1 ++ sri v14.8h, v8.8h, #5 ++ sri v14.8h, v9.8h, #11 ++ mov v28.d[0], v14.d[0] ++ mov v29.d[0], v14.d[1] ++.endm ++ ++/* TODO: expand macros and do better instructions scheduling */ ++.macro pixman_composite_over_n_0565_process_pixblock_tail_head ++ pixman_composite_over_n_0565_process_pixblock_tail ++ ld1 {v4.4h, v5.4h}, [DST_R], #16 ++ st1 {v14.8h}, [DST_W], #16 ++ pixman_composite_over_n_0565_process_pixblock_head ++ cache_preload 8, 8 ++.endm ++ ++.macro pixman_composite_over_n_0565_init ++ mov v3.s[0], w4 ++ dup v0.8b, v3.b[0] ++ dup v1.8b, v3.b[1] ++ dup v2.8b, v3.b[2] ++ dup v3.8b, v3.b[3] ++ mvn v3.8b, v3.8b /* invert source alpha */ ++.endm ++ ++generate_composite_function \ ++ pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \ ++ FLAG_DST_READWRITE, \ ++ 8, /* number of pixels, processed in a single block */ \ ++ 5, /* prefetch distance */ \ ++ pixman_composite_over_n_0565_init, \ ++ default_cleanup, \ ++ pixman_composite_over_n_0565_process_pixblock_head, \ ++ pixman_composite_over_n_0565_process_pixblock_tail, \ ++ pixman_composite_over_n_0565_process_pixblock_tail_head, \ ++ 28, /* dst_w_basereg */ \ ++ 4, /* dst_r_basereg */ \ ++ 0, /* src_basereg */ \ ++ 24 /* mask_basereg */ ++ ++/******************************************************************************/ ++ ++.macro pixman_composite_src_8888_0565_process_pixblock_head ++ ushll v8.8h, v1.8b, #7 ++ sli v8.8h, v8.8h, #1 ++ ushll v14.8h, v2.8b, #7 ++ sli v14.8h, v14.8h, #1 ++ ushll v9.8h, v0.8b, #7 ++ sli v9.8h, v9.8h, #1 ++.endm ++ ++.macro pixman_composite_src_8888_0565_process_pixblock_tail ++ sri v14.8h, v8.8h, #5 ++ sri v14.8h, v9.8h, #11 ++ mov v28.d[0], v14.d[0] ++ mov v29.d[0], v14.d[1] ++.endm ++ ++.macro pixman_composite_src_8888_0565_process_pixblock_tail_head ++ sri v14.8h, v8.8h, #5 ++ PF add PF_X, PF_X, #8 ++ PF tst PF_CTL, #0xF ++ fetch_src_pixblock ++ PF beq 10f ++ PF add PF_X, PF_X, #8 ++ PF sub PF_CTL, PF_CTL, #1 ++10: ++ sri v14.8h, v9.8h, #11 ++ mov v28.d[0], v14.d[0] ++ mov v29.d[0], v14.d[1] ++ PF cmp PF_X, ORIG_W ++ PF lsl DUMMY, PF_X, #src_bpp_shift ++ PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] ++ ushll v8.8h, v1.8b, #7 ++ sli v8.8h, v8.8h, #1 ++ st1 {v14.8h}, [DST_W], #16 ++ PF ble 10f ++ PF sub PF_X, PF_X, ORIG_W ++ PF subs PF_CTL, PF_CTL, #0x10 ++10: ++ ushll v14.8h, v2.8b, #7 ++ sli v14.8h, v14.8h, #1 ++ PF ble 10f ++ PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift ++ PF ldrsb DUMMY, [PF_SRC, DUMMY] ++ PF add PF_SRC, PF_SRC, #1 ++10: ++ ushll v9.8h, v0.8b, #7 ++ sli v9.8h, v9.8h, #1 ++.endm ++ ++generate_composite_function \ ++ pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \ ++ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ ++ 8, /* number of pixels, processed in a single block */ \ ++ 10, /* prefetch distance */ \ ++ default_init, \ ++ default_cleanup, \ ++ pixman_composite_src_8888_0565_process_pixblock_head, \ ++ pixman_composite_src_8888_0565_process_pixblock_tail, \ ++ pixman_composite_src_8888_0565_process_pixblock_tail_head ++ ++/******************************************************************************/ ++ ++.macro pixman_composite_src_0565_8888_process_pixblock_head ++ mov v0.d[1], v1.d[0] ++ shrn v30.8b, v0.8h, #8 ++ shrn v29.8b, v0.8h, #3 ++ sli v0.8h, 
v0.8h, #5 ++ movi v31.8b, #255 ++ sri v30.8b, v30.8b, #5 ++ sri v29.8b, v29.8b, #6 ++ shrn v28.8b, v0.8h, #2 ++.endm ++ ++.macro pixman_composite_src_0565_8888_process_pixblock_tail ++.endm ++ ++/* TODO: expand macros and do better instructions scheduling */ ++.macro pixman_composite_src_0565_8888_process_pixblock_tail_head ++ pixman_composite_src_0565_8888_process_pixblock_tail ++ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 ++ fetch_src_pixblock ++ pixman_composite_src_0565_8888_process_pixblock_head ++ cache_preload 8, 8 ++.endm ++ ++generate_composite_function \ ++ pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \ ++ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ ++ 8, /* number of pixels, processed in a single block */ \ ++ 10, /* prefetch distance */ \ ++ default_init, \ ++ default_cleanup, \ ++ pixman_composite_src_0565_8888_process_pixblock_head, \ ++ pixman_composite_src_0565_8888_process_pixblock_tail, \ ++ pixman_composite_src_0565_8888_process_pixblock_tail_head ++ ++/******************************************************************************/ ++ ++.macro pixman_composite_add_8_8_process_pixblock_head ++ uqadd v28.8b, v0.8b, v4.8b ++ uqadd v29.8b, v1.8b, v5.8b ++ uqadd v30.8b, v2.8b, v6.8b ++ uqadd v31.8b, v3.8b, v7.8b ++.endm ++ ++.macro pixman_composite_add_8_8_process_pixblock_tail ++.endm ++ ++.macro pixman_composite_add_8_8_process_pixblock_tail_head ++ fetch_src_pixblock ++ PF add PF_X, PF_X, #32 ++ PF tst PF_CTL, #0xF ++ ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 ++ PF beq 10f ++ PF add PF_X, PF_X, #32 ++ PF sub PF_CTL, PF_CTL, #1 ++10: ++ st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 ++ PF cmp PF_X, ORIG_W ++ PF lsl DUMMY, PF_X, #src_bpp_shift ++ PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] ++ PF lsl DUMMY, PF_X, #dst_bpp_shift ++ PF prfm PREFETCH_MODE, [PF_DST, DUMMY] ++ PF ble 10f ++ PF sub PF_X, PF_X, ORIG_W ++ PF subs PF_CTL, PF_CTL, #0x10 ++10: ++ uqadd v28.8b, v0.8b, v4.8b ++ PF ble 10f ++ PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift ++ PF ldrsb DUMMY, [PF_SRC, DUMMY] ++ PF add PF_SRC, PF_SRC, #1 ++ PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift ++ PF ldrsb DUMMY, [PF_DST, DUMMY] ++ PF add PF_DST, PF_DST, #1 ++10: ++ uqadd v29.8b, v1.8b, v5.8b ++ uqadd v30.8b, v2.8b, v6.8b ++ uqadd v31.8b, v3.8b, v7.8b ++.endm ++ ++generate_composite_function \ ++ pixman_composite_add_8_8_asm_neon, 8, 0, 8, \ ++ FLAG_DST_READWRITE, \ ++ 32, /* number of pixels, processed in a single block */ \ ++ 10, /* prefetch distance */ \ ++ default_init, \ ++ default_cleanup, \ ++ pixman_composite_add_8_8_process_pixblock_head, \ ++ pixman_composite_add_8_8_process_pixblock_tail, \ ++ pixman_composite_add_8_8_process_pixblock_tail_head ++ ++/******************************************************************************/ ++ ++.macro pixman_composite_add_8888_8888_process_pixblock_tail_head ++ fetch_src_pixblock ++ PF add PF_X, PF_X, #8 ++ PF tst PF_CTL, #0xF ++ ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 ++ PF beq 10f ++ PF add PF_X, PF_X, #8 ++ PF sub PF_CTL, PF_CTL, #1 ++10: ++ st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 ++ PF cmp PF_X, ORIG_W ++ PF lsl DUMMY, PF_X, #src_bpp_shift ++ PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] ++ PF lsl DUMMY, PF_X, #dst_bpp_shift ++ PF prfm PREFETCH_MODE, [PF_DST, DUMMY] ++ PF ble 10f ++ PF sub PF_X, PF_X, ORIG_W ++ PF subs PF_CTL, PF_CTL, #0x10 ++10: ++ uqadd v28.8b, v0.8b, v4.8b ++ PF ble 10f ++ PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift ++ PF ldrsb DUMMY, [PF_SRC, DUMMY] ++ PF add PF_SRC, PF_SRC, #1 ++ PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift 
++ PF ldrsb DUMMY, [PF_DST, DUMMY] ++ PF add PF_DST, PF_DST, #1 ++10: ++ uqadd v29.8b, v1.8b, v5.8b ++ uqadd v30.8b, v2.8b, v6.8b ++ uqadd v31.8b, v3.8b, v7.8b ++.endm ++ ++generate_composite_function \ ++ pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \ ++ FLAG_DST_READWRITE, \ ++ 8, /* number of pixels, processed in a single block */ \ ++ 10, /* prefetch distance */ \ ++ default_init, \ ++ default_cleanup, \ ++ pixman_composite_add_8_8_process_pixblock_head, \ ++ pixman_composite_add_8_8_process_pixblock_tail, \ ++ pixman_composite_add_8888_8888_process_pixblock_tail_head ++ ++generate_composite_function_single_scanline \ ++ pixman_composite_scanline_add_asm_neon, 32, 0, 32, \ ++ FLAG_DST_READWRITE, \ ++ 8, /* number of pixels, processed in a single block */ \ ++ default_init, \ ++ default_cleanup, \ ++ pixman_composite_add_8_8_process_pixblock_head, \ ++ pixman_composite_add_8_8_process_pixblock_tail, \ ++ pixman_composite_add_8888_8888_process_pixblock_tail_head ++ ++/******************************************************************************/ ++ ++.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head ++ mvn v24.8b, v3.8b /* get inverted alpha */ ++ /* do alpha blending */ ++ umull v8.8h, v24.8b, v4.8b ++ umull v9.8h, v24.8b, v5.8b ++ umull v10.8h, v24.8b, v6.8b ++ umull v11.8h, v24.8b, v7.8b ++.endm ++ ++.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail ++ urshr v14.8h, v8.8h, #8 ++ urshr v15.8h, v9.8h, #8 ++ urshr v16.8h, v10.8h, #8 ++ urshr v17.8h, v11.8h, #8 ++ raddhn v28.8b, v14.8h, v8.8h ++ raddhn v29.8b, v15.8h, v9.8h ++ raddhn v30.8b, v16.8h, v10.8h ++ raddhn v31.8b, v17.8h, v11.8h ++.endm ++ ++.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head ++ ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 ++ urshr v14.8h, v8.8h, #8 ++ PF add PF_X, PF_X, #8 ++ PF tst PF_CTL, #0xF ++ urshr v15.8h, v9.8h, #8 ++ urshr v16.8h, v10.8h, #8 ++ urshr v17.8h, v11.8h, #8 ++ PF beq 10f ++ PF add PF_X, PF_X, #8 ++ PF sub PF_CTL, PF_CTL, #1 ++10: ++ raddhn v28.8b, v14.8h, v8.8h ++ raddhn v29.8b, v15.8h, v9.8h ++ PF cmp PF_X, ORIG_W ++ raddhn v30.8b, v16.8h, v10.8h ++ raddhn v31.8b, v17.8h, v11.8h ++ fetch_src_pixblock ++ PF lsl DUMMY, PF_X, #src_bpp_shift ++ PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] ++ mvn v22.8b, v3.8b ++ PF lsl DUMMY, PF_X, #dst_bpp_shift ++ PF prfm PREFETCH_MODE, [PF_DST, DUMMY] ++ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 ++ PF ble 10f ++ PF sub PF_X, PF_X, ORIG_W ++10: ++ umull v8.8h, v22.8b, v4.8b ++ PF ble 10f ++ PF subs PF_CTL, PF_CTL, #0x10 ++10: ++ umull v9.8h, v22.8b, v5.8b ++ PF ble 10f ++ PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift ++ PF ldrsb DUMMY, [PF_SRC, DUMMY] ++ PF add PF_SRC, PF_SRC, #1 ++10: ++ umull v10.8h, v22.8b, v6.8b ++ PF ble 10f ++ PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift ++ PF ldrsb DUMMY, [PF_DST, DUMMY] ++ PF add PF_DST, PF_DST, #1 ++10: ++ umull v11.8h, v22.8b, v7.8b ++.endm ++ ++generate_composite_function_single_scanline \ ++ pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \ ++ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ ++ 8, /* number of pixels, processed in a single block */ \ ++ default_init, \ ++ default_cleanup, \ ++ pixman_composite_out_reverse_8888_8888_process_pixblock_head, \ ++ pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \ ++ pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head ++ ++/******************************************************************************/ ++ ++.macro pixman_composite_over_8888_8888_process_pixblock_head ++ 
pixman_composite_out_reverse_8888_8888_process_pixblock_head ++.endm ++ ++.macro pixman_composite_over_8888_8888_process_pixblock_tail ++ pixman_composite_out_reverse_8888_8888_process_pixblock_tail ++ uqadd v28.8b, v0.8b, v28.8b ++ uqadd v29.8b, v1.8b, v29.8b ++ uqadd v30.8b, v2.8b, v30.8b ++ uqadd v31.8b, v3.8b, v31.8b ++.endm ++ ++.macro pixman_composite_over_8888_8888_process_pixblock_tail_head ++ ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 ++ urshr v14.8h, v8.8h, #8 ++ PF add PF_X, PF_X, #8 ++ PF tst PF_CTL, #0xF ++ urshr v15.8h, v9.8h, #8 ++ urshr v16.8h, v10.8h, #8 ++ urshr v17.8h, v11.8h, #8 ++ PF beq 10f ++ PF add PF_X, PF_X, #8 ++ PF sub PF_CTL, PF_CTL, #1 ++10: ++ raddhn v28.8b, v14.8h, v8.8h ++ raddhn v29.8b, v15.8h, v9.8h ++ PF cmp PF_X, ORIG_W ++ raddhn v30.8b, v16.8h, v10.8h ++ raddhn v31.8b, v17.8h, v11.8h ++ uqadd v28.8b, v0.8b, v28.8b ++ uqadd v29.8b, v1.8b, v29.8b ++ uqadd v30.8b, v2.8b, v30.8b ++ uqadd v31.8b, v3.8b, v31.8b ++ fetch_src_pixblock ++ PF lsl DUMMY, PF_X, #src_bpp_shift ++ PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] ++ mvn v22.8b, v3.8b ++ PF lsl DUMMY, PF_X, #dst_bpp_shift ++ PF prfm PREFETCH_MODE, [PF_DST, DUMMY] ++ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 ++ PF ble 10f ++ PF sub PF_X, PF_X, ORIG_W ++10: ++ umull v8.8h, v22.8b, v4.8b ++ PF ble 10f ++ PF subs PF_CTL, PF_CTL, #0x10 ++10: ++ umull v9.8h, v22.8b, v5.8b ++ PF ble 10f ++ PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift ++ PF ldrsb DUMMY, [PF_SRC, DUMMY] ++ PF add PF_SRC, PF_SRC, #1 ++10: ++ umull v10.8h, v22.8b, v6.8b ++ PF ble 10f ++ PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift ++ PF ldrsb DUMMY, [PF_DST, DUMMY] ++ PF add PF_DST, PF_DST, #1 ++10: ++ umull v11.8h, v22.8b, v7.8b ++.endm ++ ++generate_composite_function \ ++ pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \ ++ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ ++ 8, /* number of pixels, processed in a single block */ \ ++ 5, /* prefetch distance */ \ ++ default_init, \ ++ default_cleanup, \ ++ pixman_composite_over_8888_8888_process_pixblock_head, \ ++ pixman_composite_over_8888_8888_process_pixblock_tail, \ ++ pixman_composite_over_8888_8888_process_pixblock_tail_head ++ ++generate_composite_function_single_scanline \ ++ pixman_composite_scanline_over_asm_neon, 32, 0, 32, \ ++ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ ++ 8, /* number of pixels, processed in a single block */ \ ++ default_init, \ ++ default_cleanup, \ ++ pixman_composite_over_8888_8888_process_pixblock_head, \ ++ pixman_composite_over_8888_8888_process_pixblock_tail, \ ++ pixman_composite_over_8888_8888_process_pixblock_tail_head ++ ++/******************************************************************************/ ++ ++.macro pixman_composite_over_n_8888_process_pixblock_head ++ /* deinterleaved source pixels in {v0, v1, v2, v3} */ ++ /* inverted alpha in {v24} */ ++ /* destination pixels in {v4, v5, v6, v7} */ ++ umull v8.8h, v24.8b, v4.8b ++ umull v9.8h, v24.8b, v5.8b ++ umull v10.8h, v24.8b, v6.8b ++ umull v11.8h, v24.8b, v7.8b ++.endm ++ ++.macro pixman_composite_over_n_8888_process_pixblock_tail ++ urshr v14.8h, v8.8h, #8 ++ urshr v15.8h, v9.8h, #8 ++ urshr v16.8h, v10.8h, #8 ++ urshr v17.8h, v11.8h, #8 ++ raddhn v28.8b, v14.8h, v8.8h ++ raddhn v29.8b, v15.8h, v9.8h ++ raddhn v30.8b, v16.8h, v10.8h ++ raddhn v31.8b, v17.8h, v11.8h ++ uqadd v28.8b, v0.8b, v28.8b ++ uqadd v29.8b, v1.8b, v29.8b ++ uqadd v30.8b, v2.8b, v30.8b ++ uqadd v31.8b, v3.8b, v31.8b ++.endm ++ ++.macro pixman_composite_over_n_8888_process_pixblock_tail_head ++ urshr v14.8h, v8.8h, #8 
++ urshr v15.8h, v9.8h, #8 ++ urshr v16.8h, v10.8h, #8 ++ urshr v17.8h, v11.8h, #8 ++ raddhn v28.8b, v14.8h, v8.8h ++ raddhn v29.8b, v15.8h, v9.8h ++ raddhn v30.8b, v16.8h, v10.8h ++ raddhn v31.8b, v17.8h, v11.8h ++ ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 ++ uqadd v28.8b, v0.8b, v28.8b ++ PF add PF_X, PF_X, #8 ++ PF tst PF_CTL, #0x0F ++ PF beq 10f ++ PF add PF_X, PF_X, #8 ++ PF sub PF_CTL, PF_CTL, #1 ++10: ++ uqadd v29.8b, v1.8b, v29.8b ++ uqadd v30.8b, v2.8b, v30.8b ++ uqadd v31.8b, v3.8b, v31.8b ++ PF cmp PF_X, ORIG_W ++ umull v8.8h, v24.8b, v4.8b ++ PF lsl DUMMY, PF_X, #dst_bpp_shift ++ PF prfm PREFETCH_MODE, [PF_DST, DUMMY] ++ umull v9.8h, v24.8b, v5.8b ++ PF ble 10f ++ PF sub PF_X, PF_X, ORIG_W ++10: ++ umull v10.8h, v24.8b, v6.8b ++ PF subs PF_CTL, PF_CTL, #0x10 ++ umull v11.8h, v24.8b, v7.8b ++ PF ble 10f ++ PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift ++ PF ldrsb DUMMY, [PF_DST, DUMMY] ++ PF add PF_DST, PF_DST, #1 ++10: ++ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 ++.endm ++ ++.macro pixman_composite_over_n_8888_init ++ mov v3.s[0], w4 ++ dup v0.8b, v3.b[0] ++ dup v1.8b, v3.b[1] ++ dup v2.8b, v3.b[2] ++ dup v3.8b, v3.b[3] ++ mvn v24.8b, v3.8b /* get inverted alpha */ ++.endm ++ ++generate_composite_function \ ++ pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \ ++ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ ++ 8, /* number of pixels, processed in a single block */ \ ++ 5, /* prefetch distance */ \ ++ pixman_composite_over_n_8888_init, \ ++ default_cleanup, \ ++ pixman_composite_over_8888_8888_process_pixblock_head, \ ++ pixman_composite_over_8888_8888_process_pixblock_tail, \ ++ pixman_composite_over_n_8888_process_pixblock_tail_head ++ ++/******************************************************************************/ ++ ++.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head ++ urshr v14.8h, v8.8h, #8 ++ PF add PF_X, PF_X, #8 ++ PF tst PF_CTL, #0xF ++ urshr v15.8h, v9.8h, #8 ++ urshr v12.8h, v10.8h, #8 ++ urshr v13.8h, v11.8h, #8 ++ PF beq 10f ++ PF add PF_X, PF_X, #8 ++ PF sub PF_CTL, PF_CTL, #1 ++10: ++ raddhn v28.8b, v14.8h, v8.8h ++ raddhn v29.8b, v15.8h, v9.8h ++ PF cmp PF_X, ORIG_W ++ raddhn v30.8b, v12.8h, v10.8h ++ raddhn v31.8b, v13.8h, v11.8h ++ uqadd v28.8b, v0.8b, v28.8b ++ uqadd v29.8b, v1.8b, v29.8b ++ uqadd v30.8b, v2.8b, v30.8b ++ uqadd v31.8b, v3.8b, v31.8b ++ ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_R], #32 ++ mvn v22.8b, v3.8b ++ PF lsl DUMMY, PF_X, #dst_bpp_shift ++ PF prfm PREFETCH_MODE, [PF_DST, DUMMY] ++ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 ++ PF blt 10f ++ PF sub PF_X, PF_X, ORIG_W ++10: ++ umull v8.8h, v22.8b, v4.8b ++ PF blt 10f ++ PF subs PF_CTL, PF_CTL, #0x10 ++10: ++ umull v9.8h, v22.8b, v5.8b ++ umull v10.8h, v22.8b, v6.8b ++ PF blt 10f ++ PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift ++ PF ldrsb DUMMY, [PF_DST, DUMMY] ++ PF add PF_DST, PF_DST, #1 ++10: ++ umull v11.8h, v22.8b, v7.8b ++.endm ++ ++.macro pixman_composite_over_reverse_n_8888_init ++ mov v7.s[0], w4 ++ dup v4.8b, v7.b[0] ++ dup v5.8b, v7.b[1] ++ dup v6.8b, v7.b[2] ++ dup v7.8b, v7.b[3] ++.endm ++ ++generate_composite_function \ ++ pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \ ++ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ ++ 8, /* number of pixels, processed in a single block */ \ ++ 5, /* prefetch distance */ \ ++ pixman_composite_over_reverse_n_8888_init, \ ++ default_cleanup, \ ++ pixman_composite_over_8888_8888_process_pixblock_head, \ ++ pixman_composite_over_8888_8888_process_pixblock_tail, \ ++ 
pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \ ++ 28, /* dst_w_basereg */ \ ++ 0, /* dst_r_basereg */ \ ++ 4, /* src_basereg */ \ ++ 24 /* mask_basereg */ ++ ++/******************************************************************************/ ++ ++.macro pixman_composite_over_8888_8_0565_process_pixblock_head ++ umull v0.8h, v24.8b, v8.8b /* IN for SRC pixels (part1) */ ++ umull v1.8h, v24.8b, v9.8b ++ umull v2.8h, v24.8b, v10.8b ++ umull v3.8h, v24.8b, v11.8b ++ mov v4.d[1], v5.d[0] ++ shrn v25.8b, v4.8h, #8 /* convert DST_R data to 32-bpp (part1) */ ++ shrn v26.8b, v4.8h, #3 ++ sli v4.8h, v4.8h, #5 ++ urshr v17.8h, v0.8h, #8 /* IN for SRC pixels (part2) */ ++ urshr v18.8h, v1.8h, #8 ++ urshr v19.8h, v2.8h, #8 ++ urshr v20.8h, v3.8h, #8 ++ raddhn v0.8b, v0.8h, v17.8h ++ raddhn v1.8b, v1.8h, v18.8h ++ raddhn v2.8b, v2.8h, v19.8h ++ raddhn v3.8b, v3.8h, v20.8h ++ sri v25.8b, v25.8b, #5 /* convert DST_R data to 32-bpp (part2) */ ++ sri v26.8b, v26.8b, #6 ++ mvn v3.8b, v3.8b ++ shrn v30.8b, v4.8h, #2 ++ umull v18.8h, v3.8b, v25.8b /* now do alpha blending */ ++ umull v19.8h, v3.8b, v26.8b ++ umull v20.8h, v3.8b, v30.8b ++.endm ++ ++.macro pixman_composite_over_8888_8_0565_process_pixblock_tail ++ /* 3 cycle bubble (after vmull.u8) */ ++ urshr v5.8h, v18.8h, #8 ++ urshr v6.8h, v19.8h, #8 ++ urshr v7.8h, v20.8h, #8 ++ raddhn v17.8b, v18.8h, v5.8h ++ raddhn v19.8b, v19.8h, v6.8h ++ raddhn v18.8b, v20.8h, v7.8h ++ uqadd v5.8b, v2.8b, v17.8b ++ /* 1 cycle bubble */ ++ uqadd v6.8b, v0.8b, v18.8b ++ uqadd v7.8b, v1.8b, v19.8b ++ ushll v14.8h, v5.8b, #7 /* convert to 16bpp */ ++ sli v14.8h, v14.8h, #1 ++ ushll v18.8h, v7.8b, #7 ++ sli v18.8h, v18.8h, #1 ++ ushll v19.8h, v6.8b, #7 ++ sli v19.8h, v19.8h, #1 ++ sri v14.8h, v18.8h, #5 ++ /* 1 cycle bubble */ ++ sri v14.8h, v19.8h, #11 ++ mov v28.d[0], v14.d[0] ++ mov v29.d[0], v14.d[1] ++.endm ++ ++.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head ++#if 0 ++ ld1 {v4.8h}, [DST_R], #16 ++ shrn v25.8b, v4.8h, #8 ++ fetch_mask_pixblock ++ shrn v26.8b, v4.8h, #3 ++ fetch_src_pixblock ++ umull v22.8h, v24.8b, v10.8b ++ urshr v13.8h, v18.8h, #8 ++ urshr v11.8h, v19.8h, #8 ++ urshr v15.8h, v20.8h, #8 ++ raddhn v17.8b, v18.8h, v13.8h ++ raddhn v19.8b, v19.8h, v11.8h ++ raddhn v18.8b, v20.8h, v15.8h ++ uqadd v17.8b, v2.8b, v17.8b ++ umull v21.8h, v24.8b, v9.8b ++ uqadd v18.8b, v0.8b, v18.8b ++ uqadd v19.8b, v1.8b, v19.8b ++ ushll v14.8h, v17.8b, #7 ++ sli v14.8h, v14.8h, #1 ++ umull v20.8h, v24.8b, v8.8b ++ ushll v18.8h, v18.8b, #7 ++ sli v18.8h, v18.8h, #1 ++ ushll v19.8h, v19.8b, #7 ++ sli v19.8h, v19.8h, #1 ++ sri v14.8h, v18.8h, #5 ++ umull v23.8h, v24.8b, v11.8b ++ sri v14.8h, v19.8h, #11 ++ mov v28.d[0], v14.d[0] ++ mov v29.d[0], v14.d[1] ++ ++ cache_preload 8, 8 ++ ++ sli v4.8h, v4.8h, #5 ++ urshr v16.8h, v20.8h, #8 ++ urshr v17.8h, v21.8h, #8 ++ urshr v18.8h, v22.8h, #8 ++ urshr v19.8h, v23.8h, #8 ++ raddhn v0.8b, v20.8h, v16.8h ++ raddhn v1.8b, v21.8h, v17.8h ++ raddhn v2.8b, v22.8h, v18.8h ++ raddhn v3.8b, v23.8h, v19.8h ++ sri v25.8b, v25.8b, #5 ++ sri v26.8b, v26.8b, #6 ++ mvn v3.8b, v3.8b ++ shrn v30.8b, v4.8h, #2 ++ st1 {v14.8h}, [DST_W], #16 ++ umull v18.8h, v3.8b, v25.8b ++ umull v19.8h, v3.8b, v26.8b ++ umull v20.8h, v3.8b, v30.8b ++#else ++ pixman_composite_over_8888_8_0565_process_pixblock_tail ++ st1 {v28.4h, v29.4h}, [DST_W], #16 ++ ld1 {v4.4h, v5.4h}, [DST_R], #16 ++ fetch_mask_pixblock ++ fetch_src_pixblock ++ pixman_composite_over_8888_8_0565_process_pixblock_head ++#endif ++.endm ++ 
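++/*
++ * For reference, one pixel of this operation computes roughly the
++ * following scalar C (a sketch only: the names are illustrative, not
++ * pixman's, and the (x * a + 127) / 255 rounding only approximates the
++ * urshr/raddhn sequences used above):
++ *
++ *   uint32_t s = src_8888[i];
++ *   uint32_t m = mask_a8[i];   // 8-bit mask coverage
++ *   // IN: scale all four a8r8g8b8 source channels by the mask
++ *   s = (((((s >> 24) & 0xff) * m + 127) / 255) << 24) |
++ *       (((((s >> 16) & 0xff) * m + 127) / 255) << 16) |
++ *       (((((s >> 8) & 0xff) * m + 127) / 255) << 8) |
++ *       (((s & 0xff) * m + 127) / 255);
++ *   // OVER against the r5g6b5 destination, as in the
++ *   // pixman_composite_over_8888_0565_asm_neon sketch near the top
++ *   dst_0565[i] = over_8888_0565_pixel (s, dst_0565[i]);
++ */
++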
++generate_composite_function \
++    pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    default_init_need_all_regs, \
++    default_cleanup_need_all_regs, \
++    pixman_composite_over_8888_8_0565_process_pixblock_head, \
++    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
++    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    4, /* dst_r_basereg */ \
++    8, /* src_basereg */ \
++    24 /* mask_basereg */
++
++/******************************************************************************/
++
++/*
++ * This function needs a special initialization for the solid source.
++ * The solid source pixel data is taken from the w4 argument register,
++ * split into color components and replicated into the v8-v11 (d8-d11)
++ * registers. Additionally, this function needs all the NEON registers,
++ * so it has to save the d8-d15 registers, which are callee-saved
++ * according to the ABI. These registers are restored in the 'cleanup'
++ * macro. All the other NEON registers are caller-saved, so they can be
++ * clobbered freely without introducing any problems.
++ */
++.macro pixman_composite_over_n_8_0565_init
++    mov v11.s[0], w4
++    dup v8.8b, v11.b[0]
++    dup v9.8b, v11.b[1]
++    dup v10.8b, v11.b[2]
++    dup v11.8b, v11.b[3]
++.endm
++
++.macro pixman_composite_over_n_8_0565_cleanup
++.endm
++
++generate_composite_function \
++    pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
++    FLAG_DST_READWRITE, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    pixman_composite_over_n_8_0565_init, \
++    pixman_composite_over_n_8_0565_cleanup, \
++    pixman_composite_over_8888_8_0565_process_pixblock_head, \
++    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
++    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    4, /* dst_r_basereg */ \
++    8, /* src_basereg */ \
++    24 /* mask_basereg */
++
++/******************************************************************************/
++
++.macro pixman_composite_over_8888_n_0565_init
++    mov v24.s[0], w6
++    dup v24.8b, v24.b[3]
++.endm
++
++.macro pixman_composite_over_8888_n_0565_cleanup
++.endm
++
++generate_composite_function \
++    pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \
++    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++    8, /* number of pixels, processed in a single block */ \
++    5, /* prefetch distance */ \
++    pixman_composite_over_8888_n_0565_init, \
++    pixman_composite_over_8888_n_0565_cleanup, \
++    pixman_composite_over_8888_8_0565_process_pixblock_head, \
++    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
++    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
++    28, /* dst_w_basereg */ \
++    4, /* dst_r_basereg */ \
++    8, /* src_basereg */ \
++    24 /* mask_basereg */
++
++/******************************************************************************/
++
++.macro pixman_composite_src_0565_0565_process_pixblock_head
++.endm
++
++.macro pixman_composite_src_0565_0565_process_pixblock_tail
++.endm
++
++.macro pixman_composite_src_0565_0565_process_pixblock_tail_head
++    st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DST_W], #32
++    fetch_src_pixblock
++    cache_preload 16, 16
++.endm
++
++generate_composite_function \
++    pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
++    FLAG_DST_WRITEONLY, \
++    16, /* number of pixels, processed in a single block */ \
++    10, /* prefetch distance */ \
++ default_init, \ ++ default_cleanup, \ ++ pixman_composite_src_0565_0565_process_pixblock_head, \ ++ pixman_composite_src_0565_0565_process_pixblock_tail, \ ++ pixman_composite_src_0565_0565_process_pixblock_tail_head, \ ++ 0, /* dst_w_basereg */ \ ++ 0, /* dst_r_basereg */ \ ++ 0, /* src_basereg */ \ ++ 0 /* mask_basereg */ ++ ++/******************************************************************************/ ++ ++.macro pixman_composite_src_n_8_process_pixblock_head ++.endm ++ ++.macro pixman_composite_src_n_8_process_pixblock_tail ++.endm ++ ++.macro pixman_composite_src_n_8_process_pixblock_tail_head ++ st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_W], 32 ++.endm ++ ++.macro pixman_composite_src_n_8_init ++ mov v0.s[0], w4 ++ dup v3.8b, v0.b[0] ++ dup v2.8b, v0.b[0] ++ dup v1.8b, v0.b[0] ++ dup v0.8b, v0.b[0] ++.endm ++ ++.macro pixman_composite_src_n_8_cleanup ++.endm ++ ++generate_composite_function \ ++ pixman_composite_src_n_8_asm_neon, 0, 0, 8, \ ++ FLAG_DST_WRITEONLY, \ ++ 32, /* number of pixels, processed in a single block */ \ ++ 0, /* prefetch distance */ \ ++ pixman_composite_src_n_8_init, \ ++ pixman_composite_src_n_8_cleanup, \ ++ pixman_composite_src_n_8_process_pixblock_head, \ ++ pixman_composite_src_n_8_process_pixblock_tail, \ ++ pixman_composite_src_n_8_process_pixblock_tail_head, \ ++ 0, /* dst_w_basereg */ \ ++ 0, /* dst_r_basereg */ \ ++ 0, /* src_basereg */ \ ++ 0 /* mask_basereg */ ++ ++/******************************************************************************/ ++ ++.macro pixman_composite_src_n_0565_process_pixblock_head ++.endm ++ ++.macro pixman_composite_src_n_0565_process_pixblock_tail ++.endm ++ ++.macro pixman_composite_src_n_0565_process_pixblock_tail_head ++ st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [DST_W], #32 ++.endm ++ ++.macro pixman_composite_src_n_0565_init ++ mov v0.s[0], w4 ++ dup v3.4h, v0.h[0] ++ dup v2.4h, v0.h[0] ++ dup v1.4h, v0.h[0] ++ dup v0.4h, v0.h[0] ++.endm ++ ++.macro pixman_composite_src_n_0565_cleanup ++.endm ++ ++generate_composite_function \ ++ pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \ ++ FLAG_DST_WRITEONLY, \ ++ 16, /* number of pixels, processed in a single block */ \ ++ 0, /* prefetch distance */ \ ++ pixman_composite_src_n_0565_init, \ ++ pixman_composite_src_n_0565_cleanup, \ ++ pixman_composite_src_n_0565_process_pixblock_head, \ ++ pixman_composite_src_n_0565_process_pixblock_tail, \ ++ pixman_composite_src_n_0565_process_pixblock_tail_head, \ ++ 0, /* dst_w_basereg */ \ ++ 0, /* dst_r_basereg */ \ ++ 0, /* src_basereg */ \ ++ 0 /* mask_basereg */ ++ ++/******************************************************************************/ ++ ++.macro pixman_composite_src_n_8888_process_pixblock_head ++.endm ++ ++.macro pixman_composite_src_n_8888_process_pixblock_tail ++.endm ++ ++.macro pixman_composite_src_n_8888_process_pixblock_tail_head ++ st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [DST_W], #32 ++.endm ++ ++.macro pixman_composite_src_n_8888_init ++ mov v0.s[0], w4 ++ dup v3.2s, v0.s[0] ++ dup v2.2s, v0.s[0] ++ dup v1.2s, v0.s[0] ++ dup v0.2s, v0.s[0] ++.endm ++ ++.macro pixman_composite_src_n_8888_cleanup ++.endm ++ ++generate_composite_function \ ++ pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \ ++ FLAG_DST_WRITEONLY, \ ++ 8, /* number of pixels, processed in a single block */ \ ++ 0, /* prefetch distance */ \ ++ pixman_composite_src_n_8888_init, \ ++ pixman_composite_src_n_8888_cleanup, \ ++ pixman_composite_src_n_8888_process_pixblock_head, \ ++ pixman_composite_src_n_8888_process_pixblock_tail, \ ++ 
pixman_composite_src_n_8888_process_pixblock_tail_head, \ ++ 0, /* dst_w_basereg */ \ ++ 0, /* dst_r_basereg */ \ ++ 0, /* src_basereg */ \ ++ 0 /* mask_basereg */ ++ ++/******************************************************************************/ ++ ++.macro pixman_composite_src_8888_8888_process_pixblock_head ++.endm ++ ++.macro pixman_composite_src_8888_8888_process_pixblock_tail ++.endm ++ ++.macro pixman_composite_src_8888_8888_process_pixblock_tail_head ++ st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [DST_W], #32 ++ fetch_src_pixblock ++ cache_preload 8, 8 ++.endm ++ ++generate_composite_function \ ++ pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \ ++ FLAG_DST_WRITEONLY, \ ++ 8, /* number of pixels, processed in a single block */ \ ++ 10, /* prefetch distance */ \ ++ default_init, \ ++ default_cleanup, \ ++ pixman_composite_src_8888_8888_process_pixblock_head, \ ++ pixman_composite_src_8888_8888_process_pixblock_tail, \ ++ pixman_composite_src_8888_8888_process_pixblock_tail_head, \ ++ 0, /* dst_w_basereg */ \ ++ 0, /* dst_r_basereg */ \ ++ 0, /* src_basereg */ \ ++ 0 /* mask_basereg */ ++ ++/******************************************************************************/ ++ ++.macro pixman_composite_src_x888_8888_process_pixblock_head ++ orr v0.8b, v0.8b, v4.8b ++ orr v1.8b, v1.8b, v4.8b ++ orr v2.8b, v2.8b, v4.8b ++ orr v3.8b, v3.8b, v4.8b ++.endm ++ ++.macro pixman_composite_src_x888_8888_process_pixblock_tail ++.endm ++ ++.macro pixman_composite_src_x888_8888_process_pixblock_tail_head ++ st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [DST_W], #32 ++ fetch_src_pixblock ++ orr v0.8b, v0.8b, v4.8b ++ orr v1.8b, v1.8b, v4.8b ++ orr v2.8b, v2.8b, v4.8b ++ orr v3.8b, v3.8b, v4.8b ++ cache_preload 8, 8 ++.endm ++ ++.macro pixman_composite_src_x888_8888_init ++ movi v4.2s, #0xff, lsl 24 ++.endm ++ ++generate_composite_function \ ++ pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \ ++ FLAG_DST_WRITEONLY, \ ++ 8, /* number of pixels, processed in a single block */ \ ++ 10, /* prefetch distance */ \ ++ pixman_composite_src_x888_8888_init, \ ++ default_cleanup, \ ++ pixman_composite_src_x888_8888_process_pixblock_head, \ ++ pixman_composite_src_x888_8888_process_pixblock_tail, \ ++ pixman_composite_src_x888_8888_process_pixblock_tail_head, \ ++ 0, /* dst_w_basereg */ \ ++ 0, /* dst_r_basereg */ \ ++ 0, /* src_basereg */ \ ++ 0 /* mask_basereg */ ++ ++/******************************************************************************/ ++ ++.macro pixman_composite_src_n_8_8888_process_pixblock_head ++ /* expecting solid source in {v0, v1, v2, v3} */ ++ /* mask is in v24 (v25, v26, v27 are unused) */ ++ ++ /* in */ ++ umull v8.8h, v24.8b, v0.8b ++ umull v9.8h, v24.8b, v1.8b ++ umull v10.8h, v24.8b, v2.8b ++ umull v11.8h, v24.8b, v3.8b ++ ursra v8.8h, v8.8h, #8 ++ ursra v9.8h, v9.8h, #8 ++ ursra v10.8h, v10.8h, #8 ++ ursra v11.8h, v11.8h, #8 ++.endm ++ ++.macro pixman_composite_src_n_8_8888_process_pixblock_tail ++ rshrn v28.8b, v8.8h, #8 ++ rshrn v29.8b, v9.8h, #8 ++ rshrn v30.8b, v10.8h, #8 ++ rshrn v31.8b, v11.8h, #8 ++.endm ++ ++.macro pixman_composite_src_n_8_8888_process_pixblock_tail_head ++ fetch_mask_pixblock ++ PF add PF_X, PF_X, #8 ++ rshrn v28.8b, v8.8h, #8 ++ PF tst PF_CTL, #0x0F ++ rshrn v29.8b, v9.8h, #8 ++ PF beq 10f ++ PF add PF_X, PF_X, #8 ++10: ++ rshrn v30.8b, v10.8h, #8 ++ PF beq 10f ++ PF sub PF_CTL, PF_CTL, #1 ++10: ++ rshrn v31.8b, v11.8h, #8 ++ PF cmp PF_X, ORIG_W ++ umull v8.8h, v24.8b, v0.8b ++ PF lsl DUMMY, PF_X, #mask_bpp_shift ++ PF prfm PREFETCH_MODE, [PF_MASK, DUMMY] ++ umull 
v9.8h, v24.8b, v1.8b ++ PF ble 10f ++ PF sub PF_X, PF_X, ORIG_W ++10: ++ umull v10.8h, v24.8b, v2.8b ++ PF ble 10f ++ PF subs PF_CTL, PF_CTL, #0x10 ++10: ++ umull v11.8h, v24.8b, v3.8b ++ PF ble 10f ++ PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift ++ PF ldrsb DUMMY, [PF_MASK, DUMMY] ++ PF add PF_MASK, PF_MASK, #1 ++10: ++ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 ++ ursra v8.8h, v8.8h, #8 ++ ursra v9.8h, v9.8h, #8 ++ ursra v10.8h, v10.8h, #8 ++ ursra v11.8h, v11.8h, #8 ++.endm ++ ++.macro pixman_composite_src_n_8_8888_init ++ mov v3.s[0], w4 ++ dup v0.8b, v3.b[0] ++ dup v1.8b, v3.b[1] ++ dup v2.8b, v3.b[2] ++ dup v3.8b, v3.b[3] ++.endm ++ ++.macro pixman_composite_src_n_8_8888_cleanup ++.endm ++ ++generate_composite_function \ ++ pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, \ ++ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \ ++ 8, /* number of pixels, processed in a single block */ \ ++ 5, /* prefetch distance */ \ ++ pixman_composite_src_n_8_8888_init, \ ++ pixman_composite_src_n_8_8888_cleanup, \ ++ pixman_composite_src_n_8_8888_process_pixblock_head, \ ++ pixman_composite_src_n_8_8888_process_pixblock_tail, \ ++ pixman_composite_src_n_8_8888_process_pixblock_tail_head, \ ++ ++/******************************************************************************/ ++ ++.macro pixman_composite_src_n_8_8_process_pixblock_head ++ umull v0.8h, v24.8b, v16.8b ++ umull v1.8h, v25.8b, v16.8b ++ umull v2.8h, v26.8b, v16.8b ++ umull v3.8h, v27.8b, v16.8b ++ ursra v0.8h, v0.8h, #8 ++ ursra v1.8h, v1.8h, #8 ++ ursra v2.8h, v2.8h, #8 ++ ursra v3.8h, v3.8h, #8 ++.endm ++ ++.macro pixman_composite_src_n_8_8_process_pixblock_tail ++ rshrn v28.8b, v0.8h, #8 ++ rshrn v29.8b, v1.8h, #8 ++ rshrn v30.8b, v2.8h, #8 ++ rshrn v31.8b, v3.8h, #8 ++.endm ++ ++.macro pixman_composite_src_n_8_8_process_pixblock_tail_head ++ fetch_mask_pixblock ++ PF add PF_X, PF_X, #8 ++ rshrn v28.8b, v0.8h, #8 ++ PF tst PF_CTL, #0x0F ++ rshrn v29.8b, v1.8h, #8 ++ PF beq 10f ++ PF add PF_X, PF_X, #8 ++10: ++ rshrn v30.8b, v2.8h, #8 ++ PF beq 10f ++ PF sub PF_CTL, PF_CTL, #1 ++10: ++ rshrn v31.8b, v3.8h, #8 ++ PF cmp PF_X, ORIG_W ++ umull v0.8h, v24.8b, v16.8b ++ PF lsl DUMMY, PF_X, mask_bpp_shift ++ PF prfm PREFETCH_MODE, [PF_MASK, DUMMY] ++ umull v1.8h, v25.8b, v16.8b ++ PF ble 10f ++ PF sub PF_X, PF_X, ORIG_W ++10: ++ umull v2.8h, v26.8b, v16.8b ++ PF ble 10f ++ PF subs PF_CTL, PF_CTL, #0x10 ++10: ++ umull v3.8h, v27.8b, v16.8b ++ PF ble 10f ++ PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift ++ PF ldrsb DUMMY, [PF_MASK, DUMMY] ++ PF add PF_MASK, PF_MASK, #1 ++10: ++ st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 ++ ursra v0.8h, v0.8h, #8 ++ ursra v1.8h, v1.8h, #8 ++ ursra v2.8h, v2.8h, #8 ++ ursra v3.8h, v3.8h, #8 ++.endm ++ ++.macro pixman_composite_src_n_8_8_init ++ mov v16.s[0], w4 ++ dup v16.8b, v16.b[3] ++.endm ++ ++.macro pixman_composite_src_n_8_8_cleanup ++.endm ++ ++generate_composite_function \ ++ pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, \ ++ FLAG_DST_WRITEONLY, \ ++ 32, /* number of pixels, processed in a single block */ \ ++ 5, /* prefetch distance */ \ ++ pixman_composite_src_n_8_8_init, \ ++ pixman_composite_src_n_8_8_cleanup, \ ++ pixman_composite_src_n_8_8_process_pixblock_head, \ ++ pixman_composite_src_n_8_8_process_pixblock_tail, \ ++ pixman_composite_src_n_8_8_process_pixblock_tail_head ++ ++/******************************************************************************/ ++ ++.macro pixman_composite_over_n_8_8888_process_pixblock_head ++ /* expecting deinterleaved source data in {v8, v9, v10, v11} */ 
++ /* v8 - blue, v9 - green, v10 - red, v11 - alpha */ ++ /* and destination data in {v4, v5, v6, v7} */ ++ /* mask is in v24 (v25, v26, v27 are unused) */ ++ ++ /* in */ ++ umull v12.8h, v24.8b, v8.8b ++ umull v13.8h, v24.8b, v9.8b ++ umull v14.8h, v24.8b, v10.8b ++ umull v15.8h, v24.8b, v11.8b ++ urshr v16.8h, v12.8h, #8 ++ urshr v17.8h, v13.8h, #8 ++ urshr v18.8h, v14.8h, #8 ++ urshr v19.8h, v15.8h, #8 ++ raddhn v0.8b, v12.8h, v16.8h ++ raddhn v1.8b, v13.8h, v17.8h ++ raddhn v2.8b, v14.8h, v18.8h ++ raddhn v3.8b, v15.8h, v19.8h ++ mvn v25.8b, v3.8b /* get inverted alpha */ ++ /* source: v0 - blue, v1 - green, v2 - red, v3 - alpha */ ++ /* destination: v4 - blue, v5 - green, v6 - red, v7 - alpha */ ++ /* now do alpha blending */ ++ umull v12.8h, v25.8b, v4.8b ++ umull v13.8h, v25.8b, v5.8b ++ umull v14.8h, v25.8b, v6.8b ++ umull v15.8h, v25.8b, v7.8b ++.endm ++ ++.macro pixman_composite_over_n_8_8888_process_pixblock_tail ++ urshr v16.8h, v12.8h, #8 ++ urshr v17.8h, v13.8h, #8 ++ urshr v18.8h, v14.8h, #8 ++ urshr v19.8h, v15.8h, #8 ++ raddhn v28.8b, v16.8h, v12.8h ++ raddhn v29.8b, v17.8h, v13.8h ++ raddhn v30.8b, v18.8h, v14.8h ++ raddhn v31.8b, v19.8h, v15.8h ++ uqadd v28.8b, v0.8b, v28.8b ++ uqadd v29.8b, v1.8b, v29.8b ++ uqadd v30.8b, v2.8b, v30.8b ++ uqadd v31.8b, v3.8b, v31.8b ++.endm ++ ++.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head ++ urshr v16.8h, v12.8h, #8 ++ ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 ++ urshr v17.8h, v13.8h, #8 ++ fetch_mask_pixblock ++ urshr v18.8h, v14.8h, #8 ++ PF add PF_X, PF_X, #8 ++ urshr v19.8h, v15.8h, #8 ++ PF tst PF_CTL, #0x0F ++ raddhn v28.8b, v16.8h, v12.8h ++ PF beq 10f ++ PF add PF_X, PF_X, #8 ++10: ++ raddhn v29.8b, v17.8h, v13.8h ++ PF beq 10f ++ PF sub PF_CTL, PF_CTL, #1 ++10: ++ raddhn v30.8b, v18.8h, v14.8h ++ PF cmp PF_X, ORIG_W ++ raddhn v31.8b, v19.8h, v15.8h ++ PF lsl DUMMY, PF_X, #dst_bpp_shift ++ PF prfm PREFETCH_MODE, [PF_DST, DUMMY] ++ umull v16.8h, v24.8b, v8.8b ++ PF lsl DUMMY, PF_X, #mask_bpp_shift ++ PF prfm PREFETCH_MODE, [PF_MASK, DUMMY] ++ umull v17.8h, v24.8b, v9.8b ++ PF ble 10f ++ PF sub PF_X, PF_X, ORIG_W ++10: ++ umull v18.8h, v24.8b, v10.8b ++ PF ble 10f ++ PF subs PF_CTL, PF_CTL, #0x10 ++10: ++ umull v19.8h, v24.8b, v11.8b ++ PF ble 10f ++ PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift ++ PF ldrsb DUMMY, [PF_DST, DUMMY] ++ PF add PF_DST, PF_DST, #1 ++10: ++ uqadd v28.8b, v0.8b, v28.8b ++ PF ble 10f ++ PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift ++ PF ldrsb DUMMY, [PF_MASK, DUMMY] ++ PF add PF_MASK, PF_MASK, #1 ++10: ++ uqadd v29.8b, v1.8b, v29.8b ++ uqadd v30.8b, v2.8b, v30.8b ++ uqadd v31.8b, v3.8b, v31.8b ++ urshr v12.8h, v16.8h, #8 ++ urshr v13.8h, v17.8h, #8 ++ urshr v14.8h, v18.8h, #8 ++ urshr v15.8h, v19.8h, #8 ++ raddhn v0.8b, v16.8h, v12.8h ++ raddhn v1.8b, v17.8h, v13.8h ++ raddhn v2.8b, v18.8h, v14.8h ++ raddhn v3.8b, v19.8h, v15.8h ++ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 ++ mvn v25.8b, v3.8b ++ umull v12.8h, v25.8b, v4.8b ++ umull v13.8h, v25.8b, v5.8b ++ umull v14.8h, v25.8b, v6.8b ++ umull v15.8h, v25.8b, v7.8b ++.endm ++ ++.macro pixman_composite_over_n_8_8888_init ++ mov v11.s[0], w4 ++ dup v8.8b, v11.b[0] ++ dup v9.8b, v11.b[1] ++ dup v10.8b, v11.b[2] ++ dup v11.8b, v11.b[3] ++.endm ++ ++.macro pixman_composite_over_n_8_8888_cleanup ++.endm ++ ++generate_composite_function \ ++ pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \ ++ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \ ++ 8, /* number of pixels, processed in a single block */ \ ++ 5, /* prefetch distance */ \ ++ 
pixman_composite_over_n_8_8888_init, \ ++ pixman_composite_over_n_8_8888_cleanup, \ ++ pixman_composite_over_n_8_8888_process_pixblock_head, \ ++ pixman_composite_over_n_8_8888_process_pixblock_tail, \ ++ pixman_composite_over_n_8_8888_process_pixblock_tail_head ++ ++/******************************************************************************/ ++ ++.macro pixman_composite_over_n_8_8_process_pixblock_head ++ umull v0.8h, v24.8b, v8.8b ++ umull v1.8h, v25.8b, v8.8b ++ umull v2.8h, v26.8b, v8.8b ++ umull v3.8h, v27.8b, v8.8b ++ urshr v10.8h, v0.8h, #8 ++ urshr v11.8h, v1.8h, #8 ++ urshr v12.8h, v2.8h, #8 ++ urshr v13.8h, v3.8h, #8 ++ raddhn v0.8b, v0.8h, v10.8h ++ raddhn v1.8b, v1.8h, v11.8h ++ raddhn v2.8b, v2.8h, v12.8h ++ raddhn v3.8b, v3.8h, v13.8h ++ mvn v24.8b, v0.8b ++ mvn v25.8b, v1.8b ++ mvn v26.8b, v2.8b ++ mvn v27.8b, v3.8b ++ umull v10.8h, v24.8b, v4.8b ++ umull v11.8h, v25.8b, v5.8b ++ umull v12.8h, v26.8b, v6.8b ++ umull v13.8h, v27.8b, v7.8b ++.endm ++ ++.macro pixman_composite_over_n_8_8_process_pixblock_tail ++ urshr v14.8h, v10.8h, #8 ++ urshr v15.8h, v11.8h, #8 ++ urshr v16.8h, v12.8h, #8 ++ urshr v17.8h, v13.8h, #8 ++ raddhn v28.8b, v14.8h, v10.8h ++ raddhn v29.8b, v15.8h, v11.8h ++ raddhn v30.8b, v16.8h, v12.8h ++ raddhn v31.8b, v17.8h, v13.8h ++ uqadd v28.8b, v0.8b, v28.8b ++ uqadd v29.8b, v1.8b, v29.8b ++ uqadd v30.8b, v2.8b, v30.8b ++ uqadd v31.8b, v3.8b, v31.8b ++.endm ++ ++/* TODO: expand macros and do better instructions scheduling */ ++.macro pixman_composite_over_n_8_8_process_pixblock_tail_head ++ ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32 ++ pixman_composite_over_n_8_8_process_pixblock_tail ++ fetch_mask_pixblock ++ cache_preload 32, 32 ++ st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32 ++ pixman_composite_over_n_8_8_process_pixblock_head ++.endm ++ ++.macro pixman_composite_over_n_8_8_init ++ mov v8.s[0], w4 ++ dup v8.8b, v8.b[3] ++.endm ++ ++.macro pixman_composite_over_n_8_8_cleanup ++.endm ++ ++generate_composite_function \ ++ pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \ ++ FLAG_DST_READWRITE, \ ++ 32, /* number of pixels, processed in a single block */ \ ++ 5, /* prefetch distance */ \ ++ pixman_composite_over_n_8_8_init, \ ++ pixman_composite_over_n_8_8_cleanup, \ ++ pixman_composite_over_n_8_8_process_pixblock_head, \ ++ pixman_composite_over_n_8_8_process_pixblock_tail, \ ++ pixman_composite_over_n_8_8_process_pixblock_tail_head ++ ++/******************************************************************************/ ++ ++.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head ++ /* ++ * 'combine_mask_ca' replacement ++ * ++ * input: solid src (n) in {v8, v9, v10, v11} ++ * dest in {v4, v5, v6, v7 } ++ * mask in {v24, v25, v26, v27} ++ * output: updated src in {v0, v1, v2, v3 } ++ * updated mask in {v24, v25, v26, v3 } ++ */ ++ umull v0.8h, v24.8b, v8.8b ++ umull v1.8h, v25.8b, v9.8b ++ umull v2.8h, v26.8b, v10.8b ++ umull v3.8h, v27.8b, v11.8b ++ umull v12.8h, v11.8b, v25.8b ++ umull v13.8h, v11.8b, v24.8b ++ umull v14.8h, v11.8b, v26.8b ++ urshr v15.8h, v0.8h, #8 ++ urshr v16.8h, v1.8h, #8 ++ urshr v17.8h, v2.8h, #8 ++ raddhn v0.8b, v0.8h, v15.8h ++ raddhn v1.8b, v1.8h, v16.8h ++ raddhn v2.8b, v2.8h, v17.8h ++ urshr v15.8h, v13.8h, #8 ++ urshr v16.8h, v12.8h, #8 ++ urshr v17.8h, v14.8h, #8 ++ urshr v18.8h, v3.8h, #8 ++ raddhn v24.8b, v13.8h, v15.8h ++ raddhn v25.8b, v12.8h, v16.8h ++ raddhn v26.8b, v14.8h, v17.8h ++ raddhn v3.8b, v3.8h, v18.8h ++ /* ++ * 'combine_over_ca' replacement ++ * ++ * output: updated dest in {v28, v29, v30, 
v31}
++ */
++ mvn v24.8b, v24.8b
++ mvn v25.8b, v25.8b
++ mvn v26.8b, v26.8b
++ mvn v27.8b, v3.8b
++ umull v12.8h, v24.8b, v4.8b
++ umull v13.8h, v25.8b, v5.8b
++ umull v14.8h, v26.8b, v6.8b
++ umull v15.8h, v27.8b, v7.8b
++.endm
++
++.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
++ /* ... continue 'combine_over_ca' replacement */
++ urshr v16.8h, v12.8h, #8
++ urshr v17.8h, v13.8h, #8
++ urshr v18.8h, v14.8h, #8
++ urshr v19.8h, v15.8h, #8
++ raddhn v28.8b, v16.8h, v12.8h
++ raddhn v29.8b, v17.8h, v13.8h
++ raddhn v30.8b, v18.8h, v14.8h
++ raddhn v31.8b, v19.8h, v15.8h
++ uqadd v28.8b, v0.8b, v28.8b
++ uqadd v29.8b, v1.8b, v29.8b
++ uqadd v30.8b, v2.8b, v30.8b
++ uqadd v31.8b, v3.8b, v31.8b
++.endm
++
++.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
++ urshr v16.8h, v12.8h, #8
++ urshr v17.8h, v13.8h, #8
++ ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
++ urshr v18.8h, v14.8h, #8
++ urshr v19.8h, v15.8h, #8
++ raddhn v28.8b, v16.8h, v12.8h
++ raddhn v29.8b, v17.8h, v13.8h
++ raddhn v30.8b, v18.8h, v14.8h
++ raddhn v31.8b, v19.8h, v15.8h
++ fetch_mask_pixblock
++ uqadd v28.8b, v0.8b, v28.8b
++ uqadd v29.8b, v1.8b, v29.8b
++ uqadd v30.8b, v2.8b, v30.8b
++ uqadd v31.8b, v3.8b, v31.8b
++ cache_preload 8, 8
++ pixman_composite_over_n_8888_8888_ca_process_pixblock_head
++ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
++.endm
++
++.macro pixman_composite_over_n_8888_8888_ca_init
++ mov v13.s[0], w4
++ dup v8.8b, v13.b[0]
++ dup v9.8b, v13.b[1]
++ dup v10.8b, v13.b[2]
++ dup v11.8b, v13.b[3]
++.endm
++
++.macro pixman_composite_over_n_8888_8888_ca_cleanup
++.endm
++
++generate_composite_function \
++ pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
++ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++ 8, /* number of pixels, processed in a single block */ \
++ 5, /* prefetch distance */ \
++ pixman_composite_over_n_8888_8888_ca_init, \
++ pixman_composite_over_n_8888_8888_ca_cleanup, \
++ pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
++ pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
++ pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
++
++/******************************************************************************/
++
++.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head
++ /*
++ * 'combine_mask_ca' replacement
++ *
++ * input: solid src (n) in {v8, v9, v10, v11} [B, G, R, A]
++ * mask in {v24, v25, v26} [B, G, R]
++ * output: updated src in {v0, v1, v2 } [B, G, R]
++ * updated mask in {v24, v25, v26} [B, G, R]
++ */
++ umull v0.8h, v24.8b, v8.8b
++ umull v1.8h, v25.8b, v9.8b
++ umull v2.8h, v26.8b, v10.8b
++ umull v12.8h, v11.8b, v24.8b
++ umull v13.8h, v11.8b, v25.8b
++ umull v14.8h, v11.8b, v26.8b
++ urshr v15.8h, v0.8h, #8
++ urshr v16.8h, v1.8h, #8
++ urshr v17.8h, v2.8h, #8
++ raddhn v0.8b, v0.8h, v15.8h
++ raddhn v1.8b, v1.8h, v16.8h
++ raddhn v2.8b, v2.8h, v17.8h
++ urshr v19.8h, v12.8h, #8
++ urshr v20.8h, v13.8h, #8
++ urshr v21.8h, v14.8h, #8
++ raddhn v24.8b, v12.8h, v19.8h
++ raddhn v25.8b, v13.8h, v20.8h
++ /*
++ * convert 8 r5g6b5 pixel data from {v4} to planar 8-bit format
++ * and put data into v16 - blue, v17 - green, v18 - red
++ */
++ mov v4.d[1], v5.d[0]
++ shrn v17.8b, v4.8h, #3
++ shrn v18.8b, v4.8h, #8
++ raddhn v26.8b, v14.8h, v21.8h
++ sli v4.8h, v4.8h, #5
++ sri v18.8b, v18.8b, #5
++ sri v17.8b, v17.8b, #6
++ /*
++ * 'combine_over_ca' replacement
++ *
++ * output: updated dest in v16 - blue, v17 - green, v18 - red
++ */
++ mvn v24.8b, v24.8b
++ mvn v25.8b, v25.8b
++ shrn v16.8b, v4.8h, #2
++ mvn v26.8b, v26.8b
++ umull v5.8h, v16.8b, v24.8b
++ umull v6.8h, v17.8b, v25.8b
++ umull v7.8h, v18.8b, v26.8b
++.endm
++
++.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail
++ /* ... continue 'combine_over_ca' replacement */
++ urshr v13.8h, v5.8h, #8
++ urshr v14.8h, v6.8h, #8
++ urshr v15.8h, v7.8h, #8
++ raddhn v16.8b, v13.8h, v5.8h
++ raddhn v17.8b, v14.8h, v6.8h
++ raddhn v18.8b, v15.8h, v7.8h
++ uqadd v16.8b, v0.8b, v16.8b
++ uqadd v17.8b, v1.8b, v17.8b
++ uqadd v18.8b, v2.8b, v18.8b
++ /*
++ * convert the results in v16, v17, v18 to r5g6b5 and store
++ * them into {v14}
++ */
++ ushll v14.8h, v18.8b, #7
++ sli v14.8h, v14.8h, #1
++ ushll v12.8h, v17.8b, #7
++ sli v12.8h, v12.8h, #1
++ ushll v13.8h, v16.8b, #7
++ sli v13.8h, v13.8h, #1
++ sri v14.8h, v12.8h, #5
++ sri v14.8h, v13.8h, #11
++ mov v28.d[0], v14.d[0]
++ mov v29.d[0], v14.d[1]
++.endm
++
++.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
++ fetch_mask_pixblock
++ urshr v13.8h, v5.8h, #8
++ urshr v14.8h, v6.8h, #8
++ ld1 {v4.8h}, [DST_R], #16
++ urshr v15.8h, v7.8h, #8
++ raddhn v16.8b, v13.8h, v5.8h
++ raddhn v17.8b, v14.8h, v6.8h
++ raddhn v18.8b, v15.8h, v7.8h
++ mov v5.d[0], v4.d[1]
++ /* process_pixblock_head */
++ /*
++ * 'combine_mask_ca' replacement
++ *
++ * input: solid src (n) in {v8, v9, v10, v11} [B, G, R, A]
++ * mask in {v24, v25, v26} [B, G, R]
++ * output: updated src in {v0, v1, v2 } [B, G, R]
++ * updated mask in {v24, v25, v26} [B, G, R]
++ */
++ uqadd v16.8b, v0.8b, v16.8b
++ uqadd v17.8b, v1.8b, v17.8b
++ uqadd v18.8b, v2.8b, v18.8b
++ umull v0.8h, v24.8b, v8.8b
++ umull v1.8h, v25.8b, v9.8b
++ umull v2.8h, v26.8b, v10.8b
++ /*
++ * convert the result in v16, v17, v18 to r5g6b5 and store
++ * it into {v14}
++ */
++ ushll v14.8h, v18.8b, #7
++ sli v14.8h, v14.8h, #1
++ ushll v18.8h, v16.8b, #7
++ sli v18.8h, v18.8h, #1
++ ushll v19.8h, v17.8b, #7
++ sli v19.8h, v19.8h, #1
++ umull v12.8h, v11.8b, v24.8b
++ sri v14.8h, v19.8h, #5
++ umull v13.8h, v11.8b, v25.8b
++ umull v15.8h, v11.8b, v26.8b
++ sri v14.8h, v18.8h, #11
++ mov v28.d[0], v14.d[0]
++ mov v29.d[0], v14.d[1]
++ cache_preload 8, 8
++ urshr v16.8h, v0.8h, #8
++ urshr v17.8h, v1.8h, #8
++ urshr v18.8h, v2.8h, #8
++ raddhn v0.8b, v0.8h, v16.8h
++ raddhn v1.8b, v1.8h, v17.8h
++ raddhn v2.8b, v2.8h, v18.8h
++ urshr v19.8h, v12.8h, #8
++ urshr v20.8h, v13.8h, #8
++ urshr v21.8h, v15.8h, #8
++ raddhn v24.8b, v12.8h, v19.8h
++ raddhn v25.8b, v13.8h, v20.8h
++ /*
++ * convert 8 r5g6b5 pixel data from {v4, v5} to planar
++ * 8-bit format and put data into v16 - blue, v17 - green,
++ * v18 - red
++ */
++ mov v4.d[1], v5.d[0]
++ shrn v17.8b, v4.8h, #3
++ shrn v18.8b, v4.8h, #8
++ raddhn v26.8b, v15.8h, v21.8h
++ sli v4.8h, v4.8h, #5
++ sri v17.8b, v17.8b, #6
++ sri v18.8b, v18.8b, #5
++ /*
++ * 'combine_over_ca' replacement
++ *
++ * output: updated dest in v16 - blue, v17 - green, v18 - red
++ */
++ mvn v24.8b, v24.8b
++ mvn v25.8b, v25.8b
++ shrn v16.8b, v4.8h, #2
++ mvn v26.8b, v26.8b
++ umull v5.8h, v16.8b, v24.8b
++ umull v6.8h, v17.8b, v25.8b
++ umull v7.8h, v18.8b, v26.8b
++ st1 {v14.8h}, [DST_W], #16
++.endm
++
++.macro pixman_composite_over_n_8888_0565_ca_init
++ mov v13.s[0], w4
++ dup v8.8b, v13.b[0]
++ dup v9.8b, v13.b[1]
++ dup v10.8b, v13.b[2]
++ dup v11.8b, v13.b[3]
++.endm
++
++.macro pixman_composite_over_n_8888_0565_ca_cleanup
++.endm
++
++generate_composite_function \
++ pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, \
++ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++ 8, /* number of pixels, processed in a single block */ \
++ 5, /* prefetch distance */ \
++ pixman_composite_over_n_8888_0565_ca_init, \
++ pixman_composite_over_n_8888_0565_ca_cleanup, \
++ pixman_composite_over_n_8888_0565_ca_process_pixblock_head, \
++ pixman_composite_over_n_8888_0565_ca_process_pixblock_tail, \
++ pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
++
++/******************************************************************************/
++
++.macro pixman_composite_in_n_8_process_pixblock_head
++ /* expecting source data in {v0, v1, v2, v3} */
++ /* and destination data in {v4, v5, v6, v7} */
++ umull v8.8h, v4.8b, v3.8b
++ umull v9.8h, v5.8b, v3.8b
++ umull v10.8h, v6.8b, v3.8b
++ umull v11.8h, v7.8b, v3.8b
++.endm
++
++.macro pixman_composite_in_n_8_process_pixblock_tail
++ urshr v14.8h, v8.8h, #8
++ urshr v15.8h, v9.8h, #8
++ urshr v12.8h, v10.8h, #8
++ urshr v13.8h, v11.8h, #8
++ raddhn v28.8b, v8.8h, v14.8h
++ raddhn v29.8b, v9.8h, v15.8h
++ raddhn v30.8b, v10.8h, v12.8h
++ raddhn v31.8b, v11.8h, v13.8h
++.endm
++
++.macro pixman_composite_in_n_8_process_pixblock_tail_head
++ pixman_composite_in_n_8_process_pixblock_tail
++ ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
++ cache_preload 32, 32
++ pixman_composite_in_n_8_process_pixblock_head
++ st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
++.endm
++
++.macro pixman_composite_in_n_8_init
++ mov v3.s[0], w4
++ dup v3.8b, v3.b[3]
++.endm
++
++.macro pixman_composite_in_n_8_cleanup
++.endm
++
++generate_composite_function \
++ pixman_composite_in_n_8_asm_neon, 0, 0, 8, \
++ FLAG_DST_READWRITE, \
++ 32, /* number of pixels, processed in a single block */ \
++ 5, /* prefetch distance */ \
++ pixman_composite_in_n_8_init, \
++ pixman_composite_in_n_8_cleanup, \
++ pixman_composite_in_n_8_process_pixblock_head, \
++ pixman_composite_in_n_8_process_pixblock_tail, \
++ pixman_composite_in_n_8_process_pixblock_tail_head, \
++ 28, /* dst_w_basereg */ \
++ 4, /* dst_r_basereg */ \
++ 0, /* src_basereg */ \
++ 24 /* mask_basereg */
++
++.macro pixman_composite_add_n_8_8_process_pixblock_head
++ /* expecting source data in {v8, v9, v10, v11} */
++ /* v8 - blue, v9 - green, v10 - red, v11 - alpha */
++ /* and destination data in {v4, v5, v6, v7} */
++ /* mask is in v24, v25, v26, v27 */
++ umull v0.8h, v24.8b, v11.8b
++ umull v1.8h, v25.8b, v11.8b
++ umull v2.8h, v26.8b, v11.8b
++ umull v3.8h, v27.8b, v11.8b
++ urshr v12.8h, v0.8h, #8
++ urshr v13.8h, v1.8h, #8
++ urshr v14.8h, v2.8h, #8
++ urshr v15.8h, v3.8h, #8
++ raddhn v0.8b, v0.8h, v12.8h
++ raddhn v1.8b, v1.8h, v13.8h
++ raddhn v2.8b, v2.8h, v14.8h
++ raddhn v3.8b, v3.8h, v15.8h
++ uqadd v28.8b, v0.8b, v4.8b
++ uqadd v29.8b, v1.8b, v5.8b
++ uqadd v30.8b, v2.8b, v6.8b
++ uqadd v31.8b, v3.8b, v7.8b
++.endm
++
++.macro pixman_composite_add_n_8_8_process_pixblock_tail
++.endm
++
++/* TODO: expand macros and do better instructions scheduling */
++.macro pixman_composite_add_n_8_8_process_pixblock_tail_head
++ pixman_composite_add_n_8_8_process_pixblock_tail
++ st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
++ ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
++ fetch_mask_pixblock
++ cache_preload 32, 32
++ pixman_composite_add_n_8_8_process_pixblock_head
++.endm
++
++.macro pixman_composite_add_n_8_8_init
++ mov v11.s[0], w4
++ dup v11.8b, v11.b[3]
++.endm
++
++.macro pixman_composite_add_n_8_8_cleanup
++.endm
++
++generate_composite_function \
++ pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
++ FLAG_DST_READWRITE, \
++ 32, /* number of pixels, processed in a single block */ \
++ 5, /* prefetch distance */ \
++ pixman_composite_add_n_8_8_init, \
++ pixman_composite_add_n_8_8_cleanup, \
++ pixman_composite_add_n_8_8_process_pixblock_head, \
++ pixman_composite_add_n_8_8_process_pixblock_tail, \
++ pixman_composite_add_n_8_8_process_pixblock_tail_head
++
++/******************************************************************************/
++
++.macro pixman_composite_add_8_8_8_process_pixblock_head
++ /* expecting source data in {v0, v1, v2, v3} */
++ /* destination data in {v4, v5, v6, v7} */
++ /* mask in {v24, v25, v26, v27} */
++ umull v8.8h, v24.8b, v0.8b
++ umull v9.8h, v25.8b, v1.8b
++ umull v10.8h, v26.8b, v2.8b
++ umull v11.8h, v27.8b, v3.8b
++ urshr v0.8h, v8.8h, #8
++ urshr v1.8h, v9.8h, #8
++ urshr v12.8h, v10.8h, #8
++ urshr v13.8h, v11.8h, #8
++ raddhn v0.8b, v0.8h, v8.8h
++ raddhn v1.8b, v1.8h, v9.8h
++ raddhn v2.8b, v12.8h, v10.8h
++ raddhn v3.8b, v13.8h, v11.8h
++ uqadd v28.8b, v0.8b, v4.8b
++ uqadd v29.8b, v1.8b, v5.8b
++ uqadd v30.8b, v2.8b, v6.8b
++ uqadd v31.8b, v3.8b, v7.8b
++.endm
++
++.macro pixman_composite_add_8_8_8_process_pixblock_tail
++.endm
++
++/* TODO: expand macros and do better instructions scheduling */
++.macro pixman_composite_add_8_8_8_process_pixblock_tail_head
++ pixman_composite_add_8_8_8_process_pixblock_tail
++ st1 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
++ ld1 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
++ fetch_mask_pixblock
++ fetch_src_pixblock
++ cache_preload 32, 32
++ pixman_composite_add_8_8_8_process_pixblock_head
++.endm
++
++.macro pixman_composite_add_8_8_8_init
++.endm
++
++.macro pixman_composite_add_8_8_8_cleanup
++.endm
++
++generate_composite_function \
++ pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
++ FLAG_DST_READWRITE, \
++ 32, /* number of pixels, processed in a single block */ \
++ 5, /* prefetch distance */ \
++ pixman_composite_add_8_8_8_init, \
++ pixman_composite_add_8_8_8_cleanup, \
++ pixman_composite_add_8_8_8_process_pixblock_head, \
++ pixman_composite_add_8_8_8_process_pixblock_tail, \
++ pixman_composite_add_8_8_8_process_pixblock_tail_head
++
++/******************************************************************************/
++
++.macro pixman_composite_add_8888_8888_8888_process_pixblock_head
++ /* expecting source data in {v0, v1, v2, v3} */
++ /* destination data in {v4, v5, v6, v7} */
++ /* mask in {v24, v25, v26, v27} */
++ umull v8.8h, v27.8b, v0.8b
++ umull v9.8h, v27.8b, v1.8b
++ umull v10.8h, v27.8b, v2.8b
++ umull v11.8h, v27.8b, v3.8b
++ /* 1 cycle bubble */
++ ursra v8.8h, v8.8h, #8
++ ursra v9.8h, v9.8h, #8
++ ursra v10.8h, v10.8h, #8
++ ursra v11.8h, v11.8h, #8
++.endm
++
++.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
++ /* 2 cycle bubble */
++ rshrn v28.8b, v8.8h, #8
++ rshrn v29.8b, v9.8h, #8
++ rshrn v30.8b, v10.8h, #8
++ rshrn v31.8b, v11.8h, #8
++ uqadd v28.8b, v4.8b, v28.8b
++ uqadd v29.8b, v5.8b, v29.8b
++ uqadd v30.8b, v6.8b, v30.8b
++ uqadd v31.8b, v7.8b, v31.8b
++.endm
++
++.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
++ fetch_src_pixblock
++ rshrn v28.8b, v8.8h, #8
++ fetch_mask_pixblock
++ rshrn v29.8b, v9.8h, #8
++ umull v8.8h, v27.8b, v0.8b
++ rshrn v30.8b, v10.8h, #8
++ umull v9.8h, v27.8b, v1.8b
++ rshrn v31.8b, v11.8h, #8
++ umull v10.8h, v27.8b, v2.8b
++ umull v11.8h, v27.8b, v3.8b
++ uqadd v28.8b, v4.8b, v28.8b
++ uqadd v29.8b, v5.8b, v29.8b
++ uqadd v30.8b, v6.8b, v30.8b
++ uqadd v31.8b, v7.8b, v31.8b
++ ursra v8.8h, v8.8h, #8
++ ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
++ ursra v9.8h, v9.8h, #8
++ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
++ ursra v10.8h, v10.8h, #8
++
++ cache_preload 8, 8
++
++ ursra v11.8h, v11.8h, #8
++.endm
++
++generate_composite_function \
++ pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
++ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++ 8, /* number of pixels, processed in a single block */ \
++ 10, /* prefetch distance */ \
++ default_init, \
++ default_cleanup, \
++ pixman_composite_add_8888_8888_8888_process_pixblock_head, \
++ pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
++ pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
++ 28, /* dst_w_basereg */ \
++ 4, /* dst_r_basereg */ \
++ 0, /* src_basereg */ \
++ 24 /* mask_basereg */
++
++generate_composite_function_single_scanline \
++ pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
++ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++ 8, /* number of pixels, processed in a single block */ \
++ default_init, \
++ default_cleanup, \
++ pixman_composite_add_8888_8888_8888_process_pixblock_head, \
++ pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
++ pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
++ 28, /* dst_w_basereg */ \
++ 4, /* dst_r_basereg */ \
++ 0, /* src_basereg */ \
++ 24 /* mask_basereg */
++
++/******************************************************************************/
++
++generate_composite_function \
++ pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \
++ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++ 8, /* number of pixels, processed in a single block */ \
++ 5, /* prefetch distance */ \
++ default_init, \
++ default_cleanup, \
++ pixman_composite_add_8888_8888_8888_process_pixblock_head, \
++ pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
++ pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
++ 28, /* dst_w_basereg */ \
++ 4, /* dst_r_basereg */ \
++ 0, /* src_basereg */ \
++ 27 /* mask_basereg */
++
++/******************************************************************************/
++
++.macro pixman_composite_add_n_8_8888_init
++ mov v3.s[0], w4
++ dup v0.8b, v3.b[0]
++ dup v1.8b, v3.b[1]
++ dup v2.8b, v3.b[2]
++ dup v3.8b, v3.b[3]
++.endm
++
++.macro pixman_composite_add_n_8_8888_cleanup
++.endm
++
++generate_composite_function \
++ pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \
++ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++ 8, /* number of pixels, processed in a single block */ \
++ 5, /* prefetch distance */ \
++ pixman_composite_add_n_8_8888_init, \
++ pixman_composite_add_n_8_8888_cleanup, \
++ pixman_composite_add_8888_8888_8888_process_pixblock_head, \
++ pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
++ pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
++ 28, /* dst_w_basereg */ \
++ 4, /* dst_r_basereg */ \
++ 0, /* src_basereg */ \
++ 27 /* mask_basereg */
++
++/******************************************************************************/
++
++.macro pixman_composite_add_8888_n_8888_init
++ mov v27.s[0], w6
++ dup v27.8b, v27.b[3]
++.endm
++
++.macro pixman_composite_add_8888_n_8888_cleanup
++.endm
++
++generate_composite_function \
++ pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \
++ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++ 8, /* number of pixels, processed in a single block */ \
++ 5, /* prefetch distance */ \
++ pixman_composite_add_8888_n_8888_init, \
++ pixman_composite_add_8888_n_8888_cleanup, \
++ pixman_composite_add_8888_8888_8888_process_pixblock_head, \
++ pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
++ pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
++ 28, /* dst_w_basereg */ \
++ 4, /* dst_r_basereg */ \
++ 0, /* src_basereg */ \
++ 27 /* mask_basereg */
++
++/******************************************************************************/
++
++.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
++ /* expecting source data in {v0, v1, v2, v3} */
++ /* destination data in {v4, v5, v6, v7} */
++ /* solid mask is in v15 */
++
++ /* 'in' */
++ umull v11.8h, v15.8b, v3.8b
++ umull v10.8h, v15.8b, v2.8b
++ umull v9.8h, v15.8b, v1.8b
++ umull v8.8h, v15.8b, v0.8b
++ urshr v16.8h, v11.8h, #8
++ urshr v14.8h, v10.8h, #8
++ urshr v13.8h, v9.8h, #8
++ urshr v12.8h, v8.8h, #8
++ raddhn v3.8b, v11.8h, v16.8h
++ raddhn v2.8b, v10.8h, v14.8h
++ raddhn v1.8b, v9.8h, v13.8h
++ raddhn v0.8b, v8.8h, v12.8h
++ mvn v24.8b, v3.8b /* get inverted alpha */
++ /* now do alpha blending */
++ umull v8.8h, v24.8b, v4.8b
++ umull v9.8h, v24.8b, v5.8b
++ umull v10.8h, v24.8b, v6.8b
++ umull v11.8h, v24.8b, v7.8b
++.endm
++
++.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
++ urshr v16.8h, v8.8h, #8
++ urshr v17.8h, v9.8h, #8
++ urshr v18.8h, v10.8h, #8
++ urshr v19.8h, v11.8h, #8
++ raddhn v28.8b, v16.8h, v8.8h
++ raddhn v29.8b, v17.8h, v9.8h
++ raddhn v30.8b, v18.8h, v10.8h
++ raddhn v31.8b, v19.8h, v11.8h
++.endm
++
++/* TODO: expand macros and do better instructions scheduling */
++.macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
++ ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
++ pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
++ fetch_src_pixblock
++ cache_preload 8, 8
++ fetch_mask_pixblock
++ pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
++ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
++.endm
++
++generate_composite_function_single_scanline \
++ pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
++ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++ 8, /* number of pixels, processed in a single block */ \
++ default_init_need_all_regs, \
++ default_cleanup_need_all_regs, \
++ pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
++ pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
++ pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head \
++ 28, /* dst_w_basereg */ \
++ 4, /* dst_r_basereg */ \
++ 0, /* src_basereg */ \
++ 12 /* mask_basereg */
++
++/******************************************************************************/
++
++.macro pixman_composite_over_8888_n_8888_process_pixblock_head
++ pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
++.endm
++
++.macro pixman_composite_over_8888_n_8888_process_pixblock_tail
++ pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
++ uqadd v28.8b, v0.8b, v28.8b
++ uqadd v29.8b, v1.8b, v29.8b
++ uqadd v30.8b, v2.8b, v30.8b
++ uqadd v31.8b, v3.8b, v31.8b
++.endm
++
++/* TODO: expand macros and do better instructions scheduling */
++.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
++ ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
++ pixman_composite_over_8888_n_8888_process_pixblock_tail
++ fetch_src_pixblock
++ cache_preload 8, 8
++ pixman_composite_over_8888_n_8888_process_pixblock_head
++ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
++.endm
++
++.macro pixman_composite_over_8888_n_8888_init
++ mov v15.s[0], w6
++ dup v15.8b, v15.b[3]
++.endm
++
++.macro pixman_composite_over_8888_n_8888_cleanup
++.endm
++
++generate_composite_function \
++ pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
++ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++ 8, /* number of pixels, processed in a single block */ \
++ 5, /* prefetch distance */ \
++ pixman_composite_over_8888_n_8888_init, \
++ pixman_composite_over_8888_n_8888_cleanup, \
++ pixman_composite_over_8888_n_8888_process_pixblock_head, \
++ pixman_composite_over_8888_n_8888_process_pixblock_tail, \
++ pixman_composite_over_8888_n_8888_process_pixblock_tail_head, \
++ 28, /* dst_w_basereg */ \
++ 4, /* dst_r_basereg */ \
++ 0, /* src_basereg */ \
++ 12 /* mask_basereg */
++
++/******************************************************************************/
++
++/* TODO: expand macros and do better instructions scheduling */
++.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
++ ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
++ pixman_composite_over_8888_n_8888_process_pixblock_tail
++ fetch_src_pixblock
++ cache_preload 8, 8
++ fetch_mask_pixblock
++ pixman_composite_over_8888_n_8888_process_pixblock_head
++ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
++.endm
++
++generate_composite_function \
++ pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
++ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++ 8, /* number of pixels, processed in a single block */ \
++ 5, /* prefetch distance */ \
++ default_init_need_all_regs, \
++ default_cleanup_need_all_regs, \
++ pixman_composite_over_8888_n_8888_process_pixblock_head, \
++ pixman_composite_over_8888_n_8888_process_pixblock_tail, \
++ pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
++ 28, /* dst_w_basereg */ \
++ 4, /* dst_r_basereg */ \
++ 0, /* src_basereg */ \
++ 12 /* mask_basereg */
++
++generate_composite_function_single_scanline \
++ pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
++ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++ 8, /* number of pixels, processed in a single block */ \
++ default_init_need_all_regs, \
++ default_cleanup_need_all_regs, \
++ pixman_composite_over_8888_n_8888_process_pixblock_head, \
++ pixman_composite_over_8888_n_8888_process_pixblock_tail, \
++ pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
++ 28, /* dst_w_basereg */ \
++ 4, /* dst_r_basereg */ \
++ 0, /* src_basereg */ \
++ 12 /* mask_basereg */
++
++/******************************************************************************/
++
++/* TODO: expand macros and do better instructions scheduling */
++.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
++ ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
++ pixman_composite_over_8888_n_8888_process_pixblock_tail
++ fetch_src_pixblock
++ cache_preload 8, 8
++ fetch_mask_pixblock
++ pixman_composite_over_8888_n_8888_process_pixblock_head
++ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
++.endm
++
++generate_composite_function \
++ pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
++ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++ 8, /* number of pixels, processed in a single block */ \
++ 5, /* prefetch distance */ \
++ default_init_need_all_regs, \
++ default_cleanup_need_all_regs, \
++ pixman_composite_over_8888_n_8888_process_pixblock_head, \
++ pixman_composite_over_8888_n_8888_process_pixblock_tail, \
++ pixman_composite_over_8888_8_8888_process_pixblock_tail_head \
++ 28, /* dst_w_basereg */ \
++ 4, /* dst_r_basereg */ \
++ 0, /* src_basereg */ \
++ 15 /* mask_basereg */
++
++/******************************************************************************/
++
++.macro pixman_composite_src_0888_0888_process_pixblock_head
++.endm
++
++.macro pixman_composite_src_0888_0888_process_pixblock_tail
++.endm
++
++.macro pixman_composite_src_0888_0888_process_pixblock_tail_head
++ st3 {v0.8b, v1.8b, v2.8b}, [DST_W], #24
++ fetch_src_pixblock
++ cache_preload 8, 8
++.endm
++
++generate_composite_function \
++ pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
++ FLAG_DST_WRITEONLY, \
++ 8, /* number of pixels, processed in a single block */ \
++ 10, /* prefetch distance */ \
++ default_init, \
++ default_cleanup, \
++ pixman_composite_src_0888_0888_process_pixblock_head, \
++ pixman_composite_src_0888_0888_process_pixblock_tail, \
++ pixman_composite_src_0888_0888_process_pixblock_tail_head, \
++ 0, /* dst_w_basereg */ \
++ 0, /* dst_r_basereg */ \
++ 0, /* src_basereg */ \
++ 0 /* mask_basereg */
++
++/******************************************************************************/
++
++.macro pixman_composite_src_0888_8888_rev_process_pixblock_head
++ mov v31.8b, v2.8b
++ mov v2.8b, v0.8b
++ mov v0.8b, v31.8b
++.endm
++
++.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
++.endm
++
++.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
++ st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_W], #32
++ fetch_src_pixblock
++ mov v31.8b, v2.8b
++ mov v2.8b, v0.8b
++ mov v0.8b, v31.8b
++ cache_preload 8, 8
++.endm
++
++.macro pixman_composite_src_0888_8888_rev_init
++ eor v3.8b, v3.8b, v3.8b
++.endm
++
++generate_composite_function \
++ pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
++ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
++ 8, /* number of pixels, processed in a single block */ \
++ 10, /* prefetch distance */ \
++ pixman_composite_src_0888_8888_rev_init, \
++ default_cleanup, \
++ pixman_composite_src_0888_8888_rev_process_pixblock_head, \
++ pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
++ pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
++ 0, /* dst_w_basereg */ \
++ 0, /* dst_r_basereg */ \
++ 0, /* src_basereg */ \
++ 0 /* mask_basereg */
++
++/******************************************************************************/
++
++.macro pixman_composite_src_0888_0565_rev_process_pixblock_head
++ ushll v8.8h, v1.8b, #7
++ sli v8.8h, v8.8h, #1
++ ushll v9.8h, v2.8b, #7
++ sli v9.8h, v9.8h, #1
++.endm
++
++.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
++ ushll v14.8h, v0.8b, #7
++ sli v14.8h, v14.8h, #1
++ sri v14.8h, v8.8h, #5
++ sri v14.8h, v9.8h, #11
++ mov v28.d[0], v14.d[0]
++ mov v29.d[0], v14.d[1]
++.endm
++
++.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
++ ushll v14.8h, v0.8b, #7
++ sli v14.8h, v14.8h, #1
++ fetch_src_pixblock
++ sri v14.8h, v8.8h, #5
++ sri v14.8h, v9.8h, #11
++ mov v28.d[0], v14.d[0]
++ mov v29.d[0], v14.d[1]
++ ushll v8.8h, v1.8b, #7
++ sli v8.8h, v8.8h, #1
++ st1 {v14.8h}, [DST_W], #16
++ ushll v9.8h, v2.8b, #7
++ sli v9.8h, v9.8h, #1
++.endm
++
++generate_composite_function \
++ pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
++ FLAG_DST_WRITEONLY, \
++ 8, /* number of pixels, processed in a single block */ \
++ 10, /* prefetch distance */ \
++ default_init, \
++ default_cleanup, \
++ pixman_composite_src_0888_0565_rev_process_pixblock_head, \
++ pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
++ pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
++ 28, /* dst_w_basereg */ \
++ 0, /* dst_r_basereg */ \
++ 0, /* src_basereg */ \
++ 0 /* mask_basereg */
++
++/******************************************************************************/
++
++.macro pixman_composite_src_pixbuf_8888_process_pixblock_head
++ umull v8.8h, v3.8b, v0.8b
++ umull v9.8h, v3.8b, v1.8b
++ umull v10.8h, v3.8b, v2.8b
++.endm
++
++.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
++ urshr v11.8h, v8.8h, #8
++ mov v30.8b, v31.8b
++ mov v31.8b, v3.8b
++ mov v3.8b, v30.8b
++ urshr v12.8h, v9.8h, #8
++ urshr v13.8h, v10.8h, #8
++ raddhn v30.8b, v11.8h, v8.8h
++ raddhn v29.8b, v12.8h, v9.8h
++ raddhn v28.8b, v13.8h, v10.8h
++.endm
++
++.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
++ urshr v11.8h, v8.8h, #8
++ mov v30.8b, v31.8b
++ mov v31.8b, v3.8b
++ mov v3.8b, v31.8b
++ urshr v12.8h, v9.8h, #8
++ urshr v13.8h, v10.8h, #8
++ fetch_src_pixblock
++ raddhn v30.8b, v11.8h, v8.8h
++ PF add PF_X, PF_X, #8
++ PF tst PF_CTL, #0xF
++ PF beq 10f
++ PF add PF_X, PF_X, #8
++ PF sub PF_CTL, PF_CTL, #1
++10:
++ raddhn v29.8b, v12.8h, v9.8h
++ raddhn v28.8b, v13.8h, v10.8h
++ umull v8.8h, v3.8b, v0.8b
++ umull v9.8h, v3.8b, v1.8b
++ umull v10.8h, v3.8b, v2.8b
++ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
++ PF cmp PF_X, ORIG_W
++ PF lsl DUMMY, PF_X, src_bpp_shift
++ PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
++ PF ble 10f
++ PF sub PF_X, PF_X, ORIG_W
++ PF subs PF_CTL, PF_CTL, #0x10
++ PF ble 10f
++ PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
++ PF ldrsb DUMMY, [PF_SRC, DUMMY]
++ PF add PF_SRC, PF_SRC, #1
++10:
++.endm
++
++generate_composite_function \
++ pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
++ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
++ 8, /* number of pixels, processed in a single block */ \
++ 10, /* prefetch distance */ \
++ default_init, \
++ default_cleanup, \
++ pixman_composite_src_pixbuf_8888_process_pixblock_head, \
++ pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
++ pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
++ 28, /* dst_w_basereg */ \
++ 0, /* dst_r_basereg */ \
++ 0, /* src_basereg */ \
++ 0 /* mask_basereg */
++
++/******************************************************************************/
++
++.macro pixman_composite_src_rpixbuf_8888_process_pixblock_head
++ umull v8.8h, v3.8b, v0.8b
++ umull v9.8h, v3.8b, v1.8b
++ umull v10.8h, v3.8b, v2.8b
++.endm
++
++.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail
++ urshr v11.8h, v8.8h, #8
++ mov v30.8b, v31.8b
++ mov v31.8b, v3.8b
++ mov v3.8b, v30.8b
++ urshr v12.8h, v9.8h, #8
++ urshr v13.8h, v10.8h, #8
++ raddhn v28.8b, v11.8h, v8.8h
++ raddhn v29.8b, v12.8h, v9.8h
++ raddhn v30.8b, v13.8h, v10.8h
++.endm
++
++.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
++ urshr v11.8h, v8.8h, #8
++ mov v30.8b, v31.8b
++ mov v31.8b, v3.8b
++ mov v3.8b, v30.8b
++ urshr v12.8h, v9.8h, #8
++ urshr v13.8h, v10.8h, #8
++ fetch_src_pixblock
++ raddhn v28.8b, v11.8h, v8.8h
++ PF add PF_X, PF_X, #8
++ PF tst PF_CTL, #0xF
++ PF beq 10f
++ PF add PF_X, PF_X, #8
++ PF sub PF_CTL, PF_CTL, #1
++10:
++ raddhn v29.8b, v12.8h, v9.8h
++ raddhn v30.8b, v13.8h, v10.8h
++ umull v8.8h, v3.8b, v0.8b
++ umull v9.8h, v3.8b, v1.8b
++ umull v10.8h, v3.8b, v2.8b
++ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
++ PF cmp PF_X, ORIG_W
++ PF lsl DUMMY, PF_X, src_bpp_shift
++ PF prfm PREFETCH_MODE, [PF_SRC, DUMMY]
++ PF ble 10f
++ PF sub PF_X, PF_X, ORIG_W
++ PF subs PF_CTL, PF_CTL, #0x10
++ PF ble 10f
++ PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
++ PF ldrsb DUMMY, [PF_SRC, DUMMY]
++ PF add PF_SRC, PF_SRC, #1
++10:
++.endm
++
++generate_composite_function \
++ pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
++ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
++ 8, /* number of pixels, processed in a single block */ \
++ 10, /* prefetch distance */ \
++ default_init, \
++ default_cleanup, \
++ pixman_composite_src_rpixbuf_8888_process_pixblock_head, \
++ pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \
++ pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \
++ 28, /* dst_w_basereg */ \
++ 0, /* dst_r_basereg */ \
++ 0, /* src_basereg */ \
++ 0 /* mask_basereg */
++
++/******************************************************************************/
++
++.macro pixman_composite_over_0565_8_0565_process_pixblock_head
++ /* mask is in v15 */
++ mov v4.d[0], v8.d[0]
++ mov v4.d[1], v9.d[0]
++ mov v13.d[0], v10.d[0]
++ mov v13.d[1], v11.d[0]
++ convert_0565_to_x888 v4, v2, v1, v0
++ convert_0565_to_x888 v13, v6, v5, v4
++ /* source pixel data is in {v0, v1, v2, XX} */
++ /* destination pixel data is in {v4, v5, v6, XX} */
++ mvn v7.8b, v15.8b
++ umull v10.8h, v15.8b, v2.8b
++ umull v9.8h, v15.8b, v1.8b
++ umull v8.8h, v15.8b, v0.8b
++ umull v11.8h, v7.8b, v4.8b
++ umull v12.8h, v7.8b, v5.8b
++ umull v13.8h, v7.8b, v6.8b
++ urshr v19.8h, v10.8h, #8
++ urshr v18.8h, v9.8h, #8
++ urshr v17.8h, v8.8h, #8
++ raddhn v2.8b, v10.8h, v19.8h
++ raddhn v1.8b, v9.8h, v18.8h
++ raddhn v0.8b, v8.8h, v17.8h
++.endm
++
++.macro pixman_composite_over_0565_8_0565_process_pixblock_tail
++ urshr v17.8h, v11.8h, #8
++ urshr v18.8h, v12.8h, #8
++ urshr v19.8h, v13.8h, #8
++ raddhn v28.8b, v17.8h, v11.8h
++ raddhn v29.8b, v18.8h, v12.8h
++ raddhn v30.8b, v19.8h, v13.8h
++ uqadd v0.8b, v0.8b, v28.8b
++ uqadd v1.8b, v1.8b, v29.8b
++ uqadd v2.8b, v2.8b, v30.8b
++ /* 32bpp result is in {v0, v1, v2, XX} */
++ convert_8888_to_0565 v2, v1, v0, v14, v30, v13
++ mov v28.d[0], v14.d[0]
++ mov v29.d[0], v14.d[1]
++.endm
++
++/* TODO: expand macros and do better instructions scheduling */
++.macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
++ fetch_mask_pixblock
++ pixman_composite_over_0565_8_0565_process_pixblock_tail
++ fetch_src_pixblock
++ ld1 {v10.4h, v11.4h}, [DST_R], #16
++ cache_preload 8, 8
++ pixman_composite_over_0565_8_0565_process_pixblock_head
++ st1 {v14.8h}, [DST_W], #16
++.endm
++
++generate_composite_function \
++ pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \
++ FLAG_DST_READWRITE, \
++ 8, /* number of pixels, processed in a single block */ \
++ 5, /* prefetch distance */ \
++ default_init_need_all_regs, \
++ default_cleanup_need_all_regs, \
++ pixman_composite_over_0565_8_0565_process_pixblock_head, \
++ pixman_composite_over_0565_8_0565_process_pixblock_tail, \
++ pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
++ 28, /* dst_w_basereg */ \
++ 10, /* dst_r_basereg */ \
++ 8, /* src_basereg */ \
++ 15 /* mask_basereg */
++
++/******************************************************************************/
++
++.macro pixman_composite_over_0565_n_0565_init
++ mov v15.s[0], w6
++ dup v15.8b, v15.b[3]
++.endm
++
++.macro pixman_composite_over_0565_n_0565_cleanup
++.endm
++
++generate_composite_function \
++ pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \
++ FLAG_DST_READWRITE, \
++ 8, /* number of pixels, processed in a single block */ \
++ 5, /* prefetch distance */ \
++ pixman_composite_over_0565_n_0565_init, \
++ pixman_composite_over_0565_n_0565_cleanup, \
++ pixman_composite_over_0565_8_0565_process_pixblock_head, \
++ pixman_composite_over_0565_8_0565_process_pixblock_tail, \
++ pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
++ 28, /* dst_w_basereg */ \
++ 10, /* dst_r_basereg */ \
++ 8, /* src_basereg */ \
++ 15 /* mask_basereg */
++
++/******************************************************************************/
++
++.macro pixman_composite_add_0565_8_0565_process_pixblock_head
++ /* mask is in v15 */
++ mov v4.d[0], v8.d[0]
++ mov v4.d[1], v9.d[0]
++ mov v13.d[0], v10.d[0]
++ mov v13.d[1], v11.d[0]
++ convert_0565_to_x888 v4, v2, v1, v0
++ convert_0565_to_x888 v13, v6, v5, v4
++ /* source pixel data is in {v0, v1, v2, XX} */
++ /* destination pixel data is in {v4, v5, v6, XX} */
++ umull v9.8h, v15.8b, v2.8b
++ umull v8.8h, v15.8b, v1.8b
++ umull v7.8h, v15.8b, v0.8b
++ urshr v12.8h, v9.8h, #8
++ urshr v11.8h, v8.8h, #8
++ urshr v10.8h, v7.8h, #8
++ raddhn v2.8b, v9.8h, v12.8h
++ raddhn v1.8b, v8.8h, v11.8h
++ raddhn v0.8b, v7.8h, v10.8h
++.endm
++
++.macro pixman_composite_add_0565_8_0565_process_pixblock_tail
++ uqadd v0.8b, v0.8b, v4.8b
++ uqadd v1.8b, v1.8b, v5.8b
++ uqadd v2.8b, v2.8b, v6.8b
++ /* 32bpp result is in {v0, v1, v2, XX} */
++ convert_8888_to_0565 v2, v1, v0, v14, v30, v13
++ mov v28.d[0], v14.d[0]
++ mov v29.d[0], v14.d[1]
++.endm
++
++/* TODO: expand macros and do better instructions scheduling */
++.macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
++ fetch_mask_pixblock
++ pixman_composite_add_0565_8_0565_process_pixblock_tail
++ fetch_src_pixblock
++ ld1 {v10.4h, v11.4h}, [DST_R], #16
++ cache_preload 8, 8
++ pixman_composite_add_0565_8_0565_process_pixblock_head
++ st1 {v14.8h}, [DST_W], #16
++.endm
++
++generate_composite_function \
++ pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
++ FLAG_DST_READWRITE, \
++ 8, /* number of pixels, processed in a single block */ \
++ 5, /* prefetch distance */ \
++ default_init_need_all_regs, \
++ default_cleanup_need_all_regs, \
++ pixman_composite_add_0565_8_0565_process_pixblock_head, \
++ pixman_composite_add_0565_8_0565_process_pixblock_tail, \
++ pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
++ 28, /* dst_w_basereg */ \
++ 10, /* dst_r_basereg */ \
++ 8, /* src_basereg */ \
++ 15 /* mask_basereg */
++
++/******************************************************************************/
++
++.macro pixman_composite_out_reverse_8_0565_process_pixblock_head
++ /* mask is in v15 */
++ mov v12.d[0], v10.d[0]
++ mov v12.d[1], v11.d[0]
++ convert_0565_to_x888 v12, v6, v5, v4
++ /* destination pixel data is in {v4, v5, v6, xx} */
++ mvn v24.8b, v15.8b /* get inverted alpha */
++ /* now do alpha blending */
++ umull v8.8h, v24.8b, v4.8b
++ umull v9.8h, v24.8b, v5.8b
++ umull v10.8h, v24.8b, v6.8b
++.endm
++
++.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
++ urshr v11.8h, v8.8h, #8
++ urshr v12.8h, v9.8h, #8
++ urshr v13.8h, v10.8h, #8
++ raddhn v0.8b, v11.8h, v8.8h
++ raddhn v1.8b, v12.8h, v9.8h
++ raddhn v2.8b, v13.8h, v10.8h
++ /* 32bpp result is in {v0, v1, v2, XX} */
++ convert_8888_to_0565 v2, v1, v0, v14, v12, v3
++ mov v28.d[0], v14.d[0]
++ mov v29.d[0], v14.d[1]
++.endm
++
++/* TODO: expand macros and do better instructions scheduling */
++.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
++ fetch_src_pixblock
++ pixman_composite_out_reverse_8_0565_process_pixblock_tail
++ ld1 {v10.4h, v11.4h}, [DST_R], #16
++ cache_preload 8, 8
++ pixman_composite_out_reverse_8_0565_process_pixblock_head
++ st1 {v14.8h}, [DST_W], #16
++.endm
++
++generate_composite_function \
++ pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
++ FLAG_DST_READWRITE, \
++ 8, /* number of pixels, processed in a single block */ \
++ 5, /* prefetch distance */ \
++ default_init_need_all_regs, \
++ default_cleanup_need_all_regs, \
++ pixman_composite_out_reverse_8_0565_process_pixblock_head, \
++ pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
++ pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
++ 28, /* dst_w_basereg */ \
++ 10, /* dst_r_basereg */ \
++ 15, /* src_basereg */ \
++ 0 /* mask_basereg */
++
++/******************************************************************************/
++
++.macro pixman_composite_out_reverse_8_8888_process_pixblock_head
++ /* src is in v0 */
++ /* destination pixel data is in {v4, v5, v6, v7} */
++ mvn v1.8b, v0.8b /* get inverted alpha */
++ /* now do alpha blending */
++ umull v8.8h, v1.8b, v4.8b
++ umull v9.8h, v1.8b, v5.8b
++ umull v10.8h, v1.8b, v6.8b
++ umull v11.8h, v1.8b, v7.8b
++.endm
++
++.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail
++ urshr v14.8h, v8.8h, #8
++ urshr v15.8h, v9.8h, #8
++ urshr v12.8h, v10.8h, #8
++ urshr v13.8h, v11.8h, #8
++ raddhn v28.8b, v14.8h, v8.8h
++ raddhn v29.8b, v15.8h, v9.8h
++ raddhn v30.8b, v12.8h, v10.8h
++ raddhn v31.8b, v13.8h, v11.8h
++ /* 32bpp result is in {v28, v29, v30, v31} */
++.endm
++
++/* TODO: expand macros and do better instructions scheduling */
++.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head
++ fetch_src_pixblock
++ pixman_composite_out_reverse_8_8888_process_pixblock_tail
++ ld4 {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
++ cache_preload 8, 8
++ pixman_composite_out_reverse_8_8888_process_pixblock_head
++ st4 {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
++.endm
++
++generate_composite_function \
++ pixman_composite_out_reverse_8_8888_asm_neon, 8, 0, 32, \
++ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++ 8, /* number of pixels, processed in a single block */ \
++ 5, /* prefetch distance */ \
++ default_init, \
++ default_cleanup, \
++ pixman_composite_out_reverse_8_8888_process_pixblock_head, \
++ pixman_composite_out_reverse_8_8888_process_pixblock_tail, \
++ pixman_composite_out_reverse_8_8888_process_pixblock_tail_head, \
++ 28, /* dst_w_basereg */ \
++ 4, /* dst_r_basereg */ \
++ 0, /* src_basereg */ \
++ 0 /* mask_basereg */
++
++/******************************************************************************/
++
++generate_composite_function_nearest_scanline \
++ pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
++ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++ 8, /* number of pixels, processed in a single block */ \
++ default_init, \
++ default_cleanup, \
++ pixman_composite_over_8888_8888_process_pixblock_head, \
++ pixman_composite_over_8888_8888_process_pixblock_tail, \
++ pixman_composite_over_8888_8888_process_pixblock_tail_head
++
++generate_composite_function_nearest_scanline \
++ pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \
++ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++ 8, /* number of pixels, processed in a single block */ \
++ default_init, \
++ default_cleanup, \
++ pixman_composite_over_8888_0565_process_pixblock_head, \
++ pixman_composite_over_8888_0565_process_pixblock_tail, \
++ pixman_composite_over_8888_0565_process_pixblock_tail_head, \
++ 28, /* dst_w_basereg */ \
++ 4, /* dst_r_basereg */ \
++ 0, /* src_basereg */ \
++ 24 /* mask_basereg */
++
++generate_composite_function_nearest_scanline \
++ pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \
++ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
++ 8, /* number of pixels, processed in a single block */ \
++ default_init, \
++ default_cleanup, \
++ pixman_composite_src_8888_0565_process_pixblock_head, \
++ pixman_composite_src_8888_0565_process_pixblock_tail, \
++ pixman_composite_src_8888_0565_process_pixblock_tail_head, \
++
++generate_composite_function_nearest_scanline \
++ pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
++ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
++ 8, /* number of pixels, processed in a single block */ \
++ default_init, \
++ default_cleanup, \
++ pixman_composite_src_0565_8888_process_pixblock_head, \
++ pixman_composite_src_0565_8888_process_pixblock_tail, \
++ pixman_composite_src_0565_8888_process_pixblock_tail_head
++
++generate_composite_function_nearest_scanline \
++ pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \
++ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
++ 8, /* number of pixels, processed in a single block */ \
++ default_init_need_all_regs, \
++ default_cleanup_need_all_regs, \
++ pixman_composite_over_8888_8_0565_process_pixblock_head, \
++ pixman_composite_over_8888_8_0565_process_pixblock_tail, \
++ pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
++ 28, /* dst_w_basereg */ \
++ 4, /* dst_r_basereg */ \
++ 8, /* src_basereg */ \
++ 24 /* mask_basereg */
++
++generate_composite_function_nearest_scanline \
++ pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \
++ FLAG_DST_READWRITE, \
++ 8, /* number of pixels, processed in a single block */ \
++ default_init_need_all_regs, \
++ default_cleanup_need_all_regs, \
++ pixman_composite_over_0565_8_0565_process_pixblock_head, \
++ pixman_composite_over_0565_8_0565_process_pixblock_tail, \
++ pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
++ 28, /* dst_w_basereg */ \
++ 10, /* dst_r_basereg */ \
++ 8, /* src_basereg */ \
++ 15 /* mask_basereg */
++
++/******************************************************************************/
++
++/*
++ * Bilinear scaling support code which tries to provide pixel fetching, color
++ * format conversion, and interpolation as separate macros which can be used
++ * as the basic building blocks for constructing bilinear scanline functions.
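++ *
++ * A sketch of the arithmetic implemented by these building blocks
++ * (assuming, as in upstream pixman, that the vertical weights satisfy
++ * wt + wb == 1 << BILINEAR_INTERPOLATION_BITS and that the horizontal
++ * fraction xf is taken from the top bits of the 16.16 fixed-point
++ * position X): each output channel is computed as
++ *
++ *   t0  = top_left * wt + bottom_left * wb        (umull/umlal)
++ *   t1  = top_right * wt + bottom_right * wb      (umull/umlal)
++ *   out = (t0 * ((1 << BITS) - xf) + t1 * xf) >> (2 * BITS)
++ *
++ * where BITS stands for BILINEAR_INTERPOLATION_BITS; the last line maps
++ * to the ushll/umlsl/umlal2 + shrn sequences used below.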
++ */
++
++.macro bilinear_load_8888 reg1, reg2, tmp
++ asr TMP1, X, #16
++ add X, X, UX
++ add TMP1, TOP, TMP1, lsl #2
++ ld1 {&reg1&.2s}, [TMP1], STRIDE
++ ld1 {&reg2&.2s}, [TMP1]
++.endm
++
++.macro bilinear_load_0565 reg1, reg2, tmp
++ asr TMP1, X, #16
++ add X, X, UX
++ add TMP1, TOP, TMP1, lsl #1
++ ld1 {&reg2&.s}[0], [TMP1], STRIDE
++ ld1 {&reg2&.s}[1], [TMP1]
++ convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
++.endm
++
++.macro bilinear_load_and_vertical_interpolate_two_8888 \
++ acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
++
++ bilinear_load_8888 reg1, reg2, tmp1
++ umull &acc1&.8h, &reg1&.8b, v28.8b
++ umlal &acc1&.8h, &reg2&.8b, v29.8b
++ bilinear_load_8888 reg3, reg4, tmp2
++ umull &acc2&.8h, &reg3&.8b, v28.8b
++ umlal &acc2&.8h, &reg4&.8b, v29.8b
++.endm
++
++.macro bilinear_load_and_vertical_interpolate_four_8888 \
++ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
++ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
++
++ bilinear_load_and_vertical_interpolate_two_8888 \
++ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
++ bilinear_load_and_vertical_interpolate_two_8888 \
++ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
++.endm
++
++.macro vzip reg1, reg2
++ umov TMP4, v31.d[0]
++ zip1 v31.8b, reg1, reg2
++ zip2 reg2, reg1, reg2
++ mov reg1, v31.8b
++ mov v31.d[0], TMP4
++.endm
++
++.macro vuzp reg1, reg2
++ umov TMP4, v31.d[0]
++ uzp1 v31.8b, reg1, reg2
++ uzp2 reg2, reg1, reg2
++ mov reg1, v31.8b
++ mov v31.d[0], TMP4
++.endm
++
++.macro bilinear_load_and_vertical_interpolate_two_0565 \
++ acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
++ asr TMP1, X, #16
++ add X, X, UX
++ add TMP1, TOP, TMP1, lsl #1
++ asr TMP2, X, #16
++ add X, X, UX
++ add TMP2, TOP, TMP2, lsl #1
++ ld1 {&acc2&.s}[0], [TMP1], STRIDE
++ ld1 {&acc2&.s}[2], [TMP2], STRIDE
++ ld1 {&acc2&.s}[1], [TMP1]
++ ld1 {&acc2&.s}[3], [TMP2]
++ convert_0565_to_x888 acc2, reg3, reg2, reg1
++ vzip &reg1&.8b, &reg3&.8b
++ vzip &reg2&.8b, &reg4&.8b
++ vzip &reg3&.8b, &reg4&.8b
++ vzip &reg1&.8b, &reg2&.8b
++ umull &acc1&.8h, &reg1&.8b, v28.8b
++ umlal &acc1&.8h, &reg2&.8b, v29.8b
++ umull &acc2&.8h, &reg3&.8b, v28.8b
++ umlal &acc2&.8h, &reg4&.8b, v29.8b
++.endm
++
++.macro bilinear_load_and_vertical_interpolate_four_0565 \
++ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
++ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
++ asr TMP1, X, #16
++ add X, X, UX
++ add TMP1, TOP, TMP1, lsl #1
++ asr TMP2, X, #16
++ add X, X, UX
++ add TMP2, TOP, TMP2, lsl #1
++ ld1 {&xacc2&.s}[0], [TMP1], STRIDE
++ ld1 {&xacc2&.s}[2], [TMP2], STRIDE
++ ld1 {&xacc2&.s}[1], [TMP1]
++ ld1 {&xacc2&.s}[3], [TMP2]
++ convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
++ asr TMP1, X, #16
++ add X, X, UX
++ add TMP1, TOP, TMP1, lsl #1
++ asr TMP2, X, #16
++ add X, X, UX
++ add TMP2, TOP, TMP2, lsl #1
++ ld1 {&yacc2&.s}[0], [TMP1], STRIDE
++ vzip &xreg1&.8b, &xreg3&.8b
++ ld1 {&yacc2&.s}[2], [TMP2], STRIDE
++ vzip &xreg2&.8b, &xreg4&.8b
++ ld1 {&yacc2&.s}[1], [TMP1]
++ vzip &xreg3&.8b, &xreg4&.8b
++ ld1 {&yacc2&.s}[3], [TMP2]
++ vzip &xreg1&.8b, &xreg2&.8b
++ convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
++ umull &xacc1&.8h, &xreg1&.8b, v28.8b
++ vzip &yreg1&.8b, &yreg3&.8b
++ umlal &xacc1&.8h, &xreg2&.8b, v29.8b
++ vzip &yreg2&.8b, &yreg4&.8b
++ umull &xacc2&.8h, &xreg3&.8b, v28.8b
++ vzip &yreg3&.8b, &yreg4&.8b
++ umlal &xacc2&.8h, &xreg4&.8b, v29.8b
++ vzip &yreg1&.8b, &yreg2&.8b
++ umull &yacc1&.8h, &yreg1&.8b, v28.8b
++ umlal &yacc1&.8h, &yreg2&.8b, v29.8b
++ umull &yacc2&.8h, &yreg3&.8b, v28.8b
++ umlal &yacc2&.8h, &yreg4&.8b, v29.8b
++.endm
++
++.macro bilinear_store_8888 numpix, tmp1, tmp2
++.if numpix == 4
++ st1 {v0.2s, v1.2s}, [OUT], #16
++.elseif numpix == 2
++ st1 {v0.2s}, [OUT], #8
++.elseif numpix == 1
++ st1 {v0.s}[0], [OUT], #4
++.else
++ .error bilinear_store_8888 numpix is unsupported
++.endif
++.endm
++
++.macro bilinear_store_0565 numpix, tmp1, tmp2
++ vuzp v0.8b, v1.8b
++ vuzp v2.8b, v3.8b
++ vuzp v1.8b, v3.8b
++ vuzp v0.8b, v2.8b
++ convert_8888_to_0565 v2, v1, v0, v1, tmp1, tmp2
++.if numpix == 4
++ st1 {v1.4h}, [OUT], #8
++.elseif numpix == 2
++ st1 {v1.s}[0], [OUT], #4
++.elseif numpix == 1
++ st1 {v1.h}[0], [OUT], #2
++.else
++ .error bilinear_store_0565 numpix is unsupported
++.endif
++.endm
++
++.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
++ bilinear_load_&src_fmt v0, v1, v2
++ umull v2.8h, v0.8b, v28.8b
++ umlal v2.8h, v1.8b, v29.8b
++ /* 5 cycles bubble */
++ ushll v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS
++ umlsl v0.4s, v2.4h, v15.h[0]
++ umlal2 v0.4s, v2.8h, v15.h[0]
++ /* 5 cycles bubble */
++ shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++ /* 3 cycles bubble */
++ xtn v0.8b, v0.8h
++ /* 1 cycle bubble */
++ bilinear_store_&dst_fmt 1, v3, v4
++.endm
++
++.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
++ bilinear_load_and_vertical_interpolate_two_&src_fmt \
++ v1, v11, v2, v3, v20, v21, v22, v23
++ ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS
++ umlsl v0.4s, v1.4h, v15.h[0]
++ umlal2 v0.4s, v1.8h, v15.h[0]
++ ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
++ umlsl v10.4s, v11.4h, v15.h[4]
++ umlal2 v10.4s, v11.8h, v15.h[4]
++ shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++ shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
++ add v12.8h, v12.8h, v13.8h
++ xtn v0.8b, v0.8h
++ bilinear_store_&dst_fmt 2, v3, v4
++.endm
++
++.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
++ bilinear_load_and_vertical_interpolate_four_&src_fmt \
++ v1, v11, v14, v20, v16, v17, v22, v23 \
++ v3, v9, v24, v25, v26, v27, v18, v19
++ prfm PREFETCH_MODE, [TMP1, PF_OFFS]
++ sub TMP1, TMP1, STRIDE
++ ushll v0.4s, v1.4h, #BILINEAR_INTERPOLATION_BITS
++ umlsl v0.4s, v1.4h, v15.h[0]
++ umlal2 v0.4s, v1.8h, v15.h[0]
++ ushll v10.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
++ umlsl v10.4s, v11.4h, v15.h[4]
++ umlal2 v10.4s, v11.8h, v15.h[4]
++ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
++ ushll v2.4s, v3.4h, #BILINEAR_INTERPOLATION_BITS
++ umlsl v2.4s, v3.4h, v15.h[0]
++ umlal2 v2.4s, v3.8h, v15.h[0]
++ ushll v8.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS
++ prfm PREFETCH_MODE, [TMP2, PF_OFFS]
++ umlsl v8.4s, v9.4h, v15.h[4]
++ umlal2 v8.4s, v9.8h, v15.h[4]
++ add v12.8h, v12.8h, v13.8h
++ shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++ shrn2 v0.8h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++ shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++ shrn2 v2.8h, v8.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
++ xtn v0.8b, v0.8h
++ xtn v1.8b, v2.8h
++ add v12.8h, v12.8h, v13.8h
++ bilinear_store_&dst_fmt 4, v3, v4
++.endm
++
++.macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
++.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
++ bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
++.else
++ bilinear_interpolate_four_pixels src_fmt, dst_fmt
++.endif
++.endm
++
++.macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
++.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
++ bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail
++.endif
++.endm
++
++.macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
++.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
++ bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
++.else
++ bilinear_interpolate_four_pixels src_fmt, dst_fmt
++.endif
++.endm
++
++.macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
++.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
++ bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head
++.else
++ bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
++ bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
++.endif
++.endm
++
++.macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
++.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
++ bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail
++.else
++ bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
++.endif
++.endm
++
++.macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
++.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
++ bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head
++.else
++ bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
++ bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
++.endif
++.endm
++
++.set BILINEAR_FLAG_UNROLL_4, 0
++.set BILINEAR_FLAG_UNROLL_8, 1
++.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
++
++/*
++ * Main template macro for generating NEON optimized bilinear scanline
++ * functions.
++ *
++ * Bilinear scanline scaler macro template uses the following arguments:
++ * fname - name of the function to generate
++ * src_fmt - source color format (8888 or 0565)
++ * dst_fmt - destination color format (8888 or 0565)
++ * bpp_shift - (1 << bpp_shift) is the size of source pixel in bytes
++ * prefetch_distance - prefetch in the source image by that many
++ * pixels ahead
++ */
++
++.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
++ src_bpp_shift, dst_bpp_shift, \
++ prefetch_distance, flags
++
++pixman_asm_function fname
++ OUT .req x0
++ TOP .req x1
++ BOTTOM .req x2
++ WT .req x3
++ WB .req x4
++ X .req x5
++ UX .req x6
++ WIDTH .req x7
++ TMP1 .req x8
++ TMP2 .req x9
++ PF_OFFS .req x10
++ TMP3 .req x11
++ TMP4 .req x12
++ STRIDE .req x13
++
++ sxtw x3, w3
++ sxtw x4, w4
++ sxtw x5, w5
++ sxtw x6, w6
++ sxtw x7, w7
++
++ stp x29, x30, [sp, -16]!
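++ /*
++ * Stack frame sketch (an annotation inferred from the offsets used
++ * below): x29/x30 live at the frame base, and the 112 bytes reserved
++ * under it hold v8-v15 at [x29 - 64, x29 - 1] (stored through the two
++ * post-incrementing st1 below) plus the x8/x9, x10/x11 and x12/x13
++ * pairs at x29 - 80, x29 - 96 and x29 - 112. The epilogue at label
++ * 300 reloads everything from these same offsets.
++ */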
++ mov x29, sp
++ sub sp, sp, 112 /* push all registers */
++ sub x29, x29, 64
++ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32
++ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32
++ stp x8, x9, [x29, -80]
++ stp x10, x11, [x29, -96]
++ stp x12, x13, [x29, -112]
++
++ mov PF_OFFS, #prefetch_distance
++ mul PF_OFFS, PF_OFFS, UX
++
++ subs STRIDE, BOTTOM, TOP
++ .unreq BOTTOM
++
++ cmp WIDTH, #0
++ ble 300f
++
++ dup v12.8h, w5
++ dup v13.8h, w6
++ dup v28.8b, w3
++ dup v29.8b, w4
++ mov v25.d[0], v12.d[1]
++ mov v26.d[0], v13.d[0]
++ add v25.4h, v25.4h, v26.4h
++ mov v12.d[1], v25.d[0]
++
++ /* ensure good destination alignment */
++ cmp WIDTH, #1
++ blt 100f
++ tst OUT, #(1 << dst_bpp_shift)
++ beq 100f
++ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
++ add v12.8h, v12.8h, v13.8h
++ bilinear_interpolate_last_pixel src_fmt, dst_fmt
++ sub WIDTH, WIDTH, #1
++100:
++ add v13.8h, v13.8h, v13.8h
++ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
++ add v12.8h, v12.8h, v13.8h
++
++ cmp WIDTH, #2
++ blt 100f
++ tst OUT, #(1 << (dst_bpp_shift + 1))
++ beq 100f
++ bilinear_interpolate_two_pixels src_fmt, dst_fmt
++ sub WIDTH, WIDTH, #2
++100:
++.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
++/*********** 8 pixels per iteration *****************/
++ cmp WIDTH, #4
++ blt 100f
++ tst OUT, #(1 << (dst_bpp_shift + 2))
++ beq 100f
++ bilinear_interpolate_four_pixels src_fmt, dst_fmt
++ sub WIDTH, WIDTH, #4
++100:
++ subs WIDTH, WIDTH, #8
++ blt 100f
++ asr PF_OFFS, PF_OFFS, #(16 - src_bpp_shift)
++ bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
++ subs WIDTH, WIDTH, #8
++ blt 500f
++1000:
++ bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
++ subs WIDTH, WIDTH, #8
++ bge 1000b
++500:
++ bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
++100:
++ tst WIDTH, #4
++ beq 200f
++ bilinear_interpolate_four_pixels src_fmt, dst_fmt
++200:
++.else
++/*********** 4 pixels per iteration *****************/
++ subs WIDTH, WIDTH, #4
++ blt 100f
++ asr PF_OFFS, PF_OFFS, #(16 - src_bpp_shift)
++ bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
++ subs WIDTH, WIDTH, #4
++ blt 500f
++1000:
++ bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
++ subs WIDTH, WIDTH, #4
++ bge 1000b
++500:
++ bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
++100:
++/****************************************************/
++.endif
++ /* handle the remaining trailing pixels */
++ tst WIDTH, #2
++ beq 200f
++ bilinear_interpolate_two_pixels src_fmt, dst_fmt
++200:
++ tst WIDTH, #1
++ beq 300f
++ bilinear_interpolate_last_pixel src_fmt, dst_fmt
++300:
++ sub x29, x29, 64
++ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32
++ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32
++ ldp x8, x9, [x29, -80]
++ ldp x10, x11, [x29, -96]
++ ldp x12, x13, [x29, -112]
++ mov sp, x29
++ ldp x29, x30, [sp], 16
++ ret
++
++ .unreq OUT
++ .unreq TOP
++ .unreq WT
++ .unreq WB
++ .unreq X
++ .unreq UX
++ .unreq WIDTH
++ .unreq TMP1
++ .unreq TMP2
++ .unreq PF_OFFS
++ .unreq TMP3
++ .unreq TMP4
++ .unreq STRIDE
++.endfunc
++
++.endm
++
++/*****************************************************************************/
++
++.set have_bilinear_interpolate_four_pixels_8888_8888, 1
++
++.macro bilinear_interpolate_four_pixels_8888_8888_head
++ asr TMP1, X, #16
++ add X, X, UX
++ add TMP1, TOP, TMP1, lsl #2
++ asr TMP2, X, #16
++ add X, X, UX
++ add TMP2, TOP, TMP2, lsl #2
++
++ ld1 {v22.2s}, [TMP1], STRIDE
++ ld1 {v23.2s}, [TMP1]
++ asr TMP3, X, #16
++ add X, X, UX
++ add TMP3, TOP, TMP3, lsl #2
++ umull v8.8h, v22.8b, v28.8b
++ umlal v8.8h, v23.8b, v29.8b
++
++ ld1 {v22.2s}, [TMP2], STRIDE
++ ld1 {v23.2s}, [TMP2]
++ asr TMP4, X, #16
++ add X, X, UX
++ add TMP4, TOP, TMP4, lsl #2
++ umull v9.8h, v22.8b, v28.8b
++ umlal v9.8h, v23.8b, v29.8b
++
++ ld1 {v22.2s}, [TMP3], STRIDE
++ ld1 {v23.2s}, [TMP3]
++ umull v10.8h, v22.8b, v28.8b
++ umlal v10.8h, v23.8b, v29.8b
++
++ ushll v0.4s, v8.4h, #BILINEAR_INTERPOLATION_BITS
++ umlsl v0.4s, v8.4h, v15.h[0]
++ umlal2 v0.4s, v8.8h, v15.h[0]
++
++ prfm PREFETCH_MODE, [TMP4, PF_OFFS]
++ ld1 {v16.2s}, [TMP4], STRIDE
++ ld1 {v17.2s}, [TMP4]
++ prfm PREFETCH_MODE, [TMP4, PF_OFFS]
++ umull v11.8h, v16.8b, v28.8b
++ umlal v11.8h, v17.8b, v29.8b
++
++ ushll v1.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS
++ umlsl v1.4s, v9.4h, v15.h[4]
++.endm
++
++.macro bilinear_interpolate_four_pixels_8888_8888_tail
++ umlal2 v1.4s, v9.8h, v15.h[4]
++ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
++ ushll v2.4s, v10.4h, #BILINEAR_INTERPOLATION_BITS
++ umlsl v2.4s, v10.4h, v15.h[0]
++ umlal2 v2.4s, v10.8h, v15.h[0]
++ ushll v3.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
++ umlsl v3.4s, v11.4h, v15.h[4]
++ umlal2 v3.4s, v11.8h, v15.h[4]
++ add v12.8h, v12.8h, v13.8h
++ shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++ shrn2 v0.8h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++ shrn v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
++ shrn2 v2.8h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++ xtn v6.8b, v0.8h
++ xtn v7.8b, v2.8h
++ add v12.8h, v12.8h, v13.8h
++ st1 {v6.2s, v7.2s}, [OUT], #16
++.endm
++
++.macro bilinear_interpolate_four_pixels_8888_8888_tail_head
++ asr TMP1, X, #16
++ add X, X, UX
++ add TMP1, TOP, TMP1, lsl #2
++ asr TMP2, X, #16
++ add X, X, UX
++ add TMP2, TOP, TMP2, lsl #2
++ umlal2 v1.4s, v9.8h, v15.h[4]
++ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
++ ushll v2.4s, v10.4h, #BILINEAR_INTERPOLATION_BITS
++ umlsl v2.4s, v10.4h, v15.h[0]
++ umlal2 v2.4s, v10.8h, v15.h[0]
++ ushll v3.4s, v11.4h, #BILINEAR_INTERPOLATION_BITS
++ ld1 {v20.2s}, [TMP1], STRIDE
++ umlsl v3.4s, v11.4h, v15.h[4]
++ umlal2 v3.4s, v11.8h, v15.h[4]
++ ld1 {v21.2s}, [TMP1]
++ umull v8.8h, v20.8b, v28.8b
++ umlal v8.8h, v21.8b, v29.8b
++ shrn v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++ shrn2 v0.8h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++ shrn v4.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++ ld1 {v22.2s}, [TMP2], STRIDE
++ shrn2 v4.8h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
++ add v12.8h, v12.8h, v13.8h
++ ld1 {v23.2s}, [TMP2]
++ umull v9.8h, v22.8b, v28.8b
++ asr TMP3, X, #16
++ add X, X, UX
++ add TMP3, TOP, TMP3, lsl #2
++ asr TMP4, X, #16
++ add X, X, UX
++ add TMP4, TOP, TMP4, lsl #2
++ umlal v9.8h, v23.8b, v29.8b
++ ld1 {v22.2s}, [TMP3], STRIDE
++ ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
++ ld1 {v23.2s}, [TMP3]
++ umull v10.8h, v22.8b, v28.8b
++ umlal v10.8h, v23.8b, v29.8b
++ xtn v6.8b, v0.8h
++ ushll v0.4s, v8.4h, #BILINEAR_INTERPOLATION_BITS
++ xtn v7.8b, v4.8h
++ umlsl v0.4s, v8.4h, v15.h[0]
++ umlal2 v0.4s, v8.8h, v15.h[0]
++ prfm PREFETCH_MODE, [TMP4, PF_OFFS]
++ ld1 {v16.2s}, [TMP4], STRIDE
++ add v12.8h, v12.8h, v13.8h
++ ld1 {v17.2s}, [TMP4]
++ prfm PREFETCH_MODE, [TMP4, PF_OFFS]
++ umull v11.8h, v16.8b, v28.8b
++ umlal v11.8h, v17.8b, v29.8b
++ st1 {v6.2s, v7.2s}, [OUT], #16
++ ushll v1.4s, v9.4h, #BILINEAR_INTERPOLATION_BITS
++ umlsl v1.4s, v9.4h, v15.h[4]
++.endm
++
++/*****************************************************************************/
++
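++/*
++ * The instantiations below expand the template into four complete
++ * scanline scalers. Given the register aliases declared in
++ * 'generate_bilinear_scanline_func' (OUT = x0, TOP = x1, BOTTOM = x2,
++ * WT = w3, WB = w4, X = w5, UX = w6, WIDTH = w7), each generated
++ * function is callable from C roughly as follows (a sketch -- the real
++ * prototypes live on the C side of pixman, not in this patch):
++ *
++ *   void pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon (
++ *       uint32_t *out, const uint32_t *top, const uint32_t *bottom,
++ *       int wt, int wb, pixman_fixed_t x, pixman_fixed_t ux, int width);
++ *
++ * 'top' and 'bottom' address the two source scanlines being
++ * interpolated, 'wt'/'wb' are the vertical weights, and 'x'/'ux' give
++ * the 16.16 fixed-point start position and per-pixel increment.
++ */
++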
++generate_bilinear_scanline_func \ ++ pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \ ++ 2, 2, 28, BILINEAR_FLAG_UNROLL_4 ++ ++generate_bilinear_scanline_func \ ++ pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \ ++ 2, 1, 28, BILINEAR_FLAG_UNROLL_8 | BILINEAR_FLAG_USE_ALL_NEON_REGS ++ ++generate_bilinear_scanline_func \ ++ pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \ ++ 1, 2, 28, BILINEAR_FLAG_UNROLL_4 ++ ++generate_bilinear_scanline_func \ ++ pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, \ ++ 1, 1, 28, BILINEAR_FLAG_UNROLL_4 +diff --git a/pixman/pixman-arma64-neon-asm.h b/pixman/pixman-arma64-neon-asm.h +new file mode 100644 +index 0000000..0389d12 +--- /dev/null ++++ b/pixman/pixman-arma64-neon-asm.h +@@ -0,0 +1,1310 @@ ++/* ++ * Copyright © 2009 Nokia Corporation ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the next ++ * paragraph) shall be included in all copies or substantial portions of the ++ * Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER ++ * DEALINGS IN THE SOFTWARE. ++ * ++ * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com) ++ */ ++ ++/* ++ * This file contains a macro ('generate_composite_function') which can ++ * construct 2D image processing functions, based on a common template. ++ * Any combinations of source, destination and mask images with 8bpp, ++ * 16bpp, 24bpp, 32bpp color formats are supported. ++ * ++ * This macro takes care of: ++ * - handling of leading and trailing unaligned pixels ++ * - doing most of the work related to L2 cache preload ++ * - encourages the use of software pipelining for better instructions ++ * scheduling ++ * ++ * The user of this macro has to provide some configuration parameters ++ * (bit depths for the images, prefetch distance, etc.) and a set of ++ * macros, which should implement basic code chunks responsible for ++ * pixels processing. See 'pixman-armv8-neon-asm.S' file for the usage ++ * examples. ++ * ++ * TODO: ++ * - try overlapped pixel method (from Ian Rickards) when processing ++ * exactly two blocks of pixels ++ * - maybe add an option to do reverse scanline processing ++ */ ++ ++/* ++ * Bit flags for 'generate_composite_function' macro which are used ++ * to tune generated functions behavior. ++ */ ++.set FLAG_DST_WRITEONLY, 0 ++.set FLAG_DST_READWRITE, 1 ++.set FLAG_DEINTERLEAVE_32BPP, 2 ++ ++/* ++ * Constants for selecting preferable prefetch type. 
++ */ ++.set PREFETCH_TYPE_NONE, 0 /* No prefetch at all */ ++.set PREFETCH_TYPE_SIMPLE, 1 /* A simple, fixed-distance-ahead prefetch */ ++.set PREFETCH_TYPE_ADVANCED, 2 /* Advanced fine-grained prefetch */ ++ ++/* ++ * prefetch mode ++ * available modes are: ++ * pldl1keep ++ * pldl1strm ++ * pldl2keep ++ * pldl2strm ++ * pldl3keep ++ * pldl3strm ++ */ ++#define PREFETCH_MODE pldl1keep ++ ++/* ++ * Definitions of supplementary pixld/pixst macros (for partial load/store of ++ * pixel data). ++ */ ++ ++.macro pixldst1 op, elem_size, reg1, mem_operand, abits ++ op {v®1&.&elem_size}, [&mem_operand&], #8 ++.endm ++ ++.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits ++ op {v®1&.&elem_size, v®2&.&elem_size}, [&mem_operand&], #16 ++.endm ++ ++.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits ++ op {v®1&.&elem_size, v®2&.&elem_size, v®3&.&elem_size, v®4&.&elem_size}, [&mem_operand&], #32 ++.endm ++ ++.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits, bytes ++ op {v®1&.&elem_size}[idx], [&mem_operand&], #&bytes& ++.endm ++ ++.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand ++ op {v®1&.&elem_size, v®2&.&elem_size, v®3&.&elem_size}, [&mem_operand&], #24 ++.endm ++ ++.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand ++ op {v®1&.&elem_size, v®2&.&elem_size, v®3&.&elem_size}[idx], [&mem_operand&], #3 ++.endm ++ ++.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits ++.if numbytes == 32 ++ .if elem_size==32 ++ pixldst4 op, 2s, %(basereg+4), %(basereg+5), \ ++ %(basereg+6), %(basereg+7), mem_operand, abits ++ .elseif elem_size==16 ++ pixldst4 op, 4h, %(basereg+4), %(basereg+5), \ ++ %(basereg+6), %(basereg+7), mem_operand, abits ++ .else ++ pixldst4 op, 8b, %(basereg+4), %(basereg+5), \ ++ %(basereg+6), %(basereg+7), mem_operand, abits ++ .endif ++.elseif numbytes == 16 ++ .if elem_size==32 ++ pixldst2 op, 2s, %(basereg+2), %(basereg+3), mem_operand, abits ++ .elseif elem_size==16 ++ pixldst2 op, 4h, %(basereg+2), %(basereg+3), mem_operand, abits ++ .else ++ pixldst2 op, 8b, %(basereg+2), %(basereg+3), mem_operand, abits ++ .endif ++.elseif numbytes == 8 ++ .if elem_size==32 ++ pixldst1 op, 2s, %(basereg+1), mem_operand, abits ++ .elseif elem_size==16 ++ pixldst1 op, 4h, %(basereg+1), mem_operand, abits ++ .else ++ pixldst1 op, 8b, %(basereg+1), mem_operand, abits ++ .endif ++.elseif numbytes == 4 ++ .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32) ++ pixldst0 op, s, %(basereg+0), 1, mem_operand, abits, 4 ++ .elseif elem_size == 16 ++ pixldst0 op, h, %(basereg+0), 2, mem_operand, abits, 2 ++ pixldst0 op, h, %(basereg+0), 3, mem_operand, abits, 2 ++ .else ++ pixldst0 op, b, %(basereg+0), 4, mem_operand, abits, 1 ++ pixldst0 op, b, %(basereg+0), 5, mem_operand, abits, 1 ++ pixldst0 op, b, %(basereg+0), 6, mem_operand, abits, 1 ++ pixldst0 op, b, %(basereg+0), 7, mem_operand, abits, 1 ++ .endif ++.elseif numbytes == 2 ++ .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16) ++ pixldst0 op, h, %(basereg+0), 1, mem_operand, abits, 2 ++ .else ++ pixldst0 op, b, %(basereg+0), 2, mem_operand, abits, 1 ++ pixldst0 op, b, %(basereg+0), 3, mem_operand, abits, 1 ++ .endif ++.elseif numbytes == 1 ++ pixldst0 op, b, %(basereg+0), 1, mem_operand, abits, 1 ++.else ++ .error "unsupported size: numbytes" ++.endif ++.endm ++ ++.macro pixld numpix, bpp, basereg, mem_operand, abits=0 ++.if bpp > 0 ++.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) ++ pixldst4 ld4, 8b, %(basereg+4), %(basereg+5), \ ++ %(basereg+6), 
%(basereg+7), mem_operand, abits ++.elseif (bpp == 24) && (numpix == 8) ++ pixldst3 ld3, 8b, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand ++.elseif (bpp == 24) && (numpix == 4) ++ pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand ++ pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand ++ pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand ++ pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand ++.elseif (bpp == 24) && (numpix == 2) ++ pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand ++ pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand ++.elseif (bpp == 24) && (numpix == 1) ++ pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand ++.else ++ pixldst %(numpix * bpp / 8), ld1, %(bpp), basereg, mem_operand, abits ++.endif ++.endif ++.endm ++ ++.macro pixst numpix, bpp, basereg, mem_operand, abits=0 ++.if bpp > 0 ++.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) ++ pixldst4 st4, 8b, %(basereg+4), %(basereg+5), \ ++ %(basereg+6), %(basereg+7), mem_operand, abits ++.elseif (bpp == 24) && (numpix == 8) ++ pixldst3 st3, 8b, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand ++.elseif (bpp == 24) && (numpix == 4) ++ pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand ++ pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand ++ pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand ++ pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand ++.elseif (bpp == 24) && (numpix == 2) ++ pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand ++ pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand ++.elseif (bpp == 24) && (numpix == 1) ++ pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand ++.elseif numpix * bpp == 32 && abits == 32 ++ pixldst 4, st1, 32, basereg, mem_operand, abits ++.elseif numpix * bpp == 16 && abits == 16 ++ pixldst 2, st1, 16, basereg, mem_operand, abits ++.else ++ pixldst %(numpix * bpp / 8), st1, %(bpp), basereg, mem_operand, abits ++.endif ++.endif ++.endm ++ ++.macro pixld_a numpix, bpp, basereg, mem_operand ++.if (bpp * numpix) <= 128 ++ pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix) ++.else ++ pixld numpix, bpp, basereg, mem_operand, 128 ++.endif ++.endm ++ ++.macro pixst_a numpix, bpp, basereg, mem_operand ++.if (bpp * numpix) <= 128 ++ pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix) ++.else ++ pixst numpix, bpp, basereg, mem_operand, 128 ++.endif ++.endm ++ ++/* ++ * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register ++ * aliases to be defined) ++ */ ++.macro pixld1_s elem_size, reg1, mem_operand ++.if elem_size == 16 ++ asr TMP1, VX, #16 ++ adds VX, VX, UNIT_X ++ bmi 55f ++5: subs VX, VX, SRC_WIDTH_FIXED ++ bpl 5b ++55: ++ add TMP1, mem_operand, TMP1, lsl #1 ++ asr TMP2, VX, #16 ++ adds VX, VX, UNIT_X ++ bmi 55f ++5: subs VX, VX, SRC_WIDTH_FIXED ++ bpl 5b ++55: ++ add TMP2, mem_operand, TMP2, lsl #1 ++ ld1 {v®1&.h}[0], [TMP1] ++ asr TMP1, VX, #16 ++ adds VX, VX, UNIT_X ++ bmi 55f ++5: subs VX, VX, SRC_WIDTH_FIXED ++ bpl 5b ++55: ++ add TMP1, mem_operand, TMP1, lsl #1 ++ ld1 {v®1&.h}[1], [TMP2] ++ asr TMP2, VX, #16 ++ adds VX, VX, UNIT_X ++ bmi 55f ++5: subs VX, VX, SRC_WIDTH_FIXED ++ bpl 5b ++55: ++ add TMP2, mem_operand, TMP2, lsl #1 ++ ld1 {v®1&.h}[2], [TMP1] ++ ld1 {v®1&.h}[3], [TMP2] ++.elseif 
elem_size == 32 ++ asr TMP1, VX, #16 ++ adds VX, VX, UNIT_X ++ bmi 55f ++5: subs VX, VX, SRC_WIDTH_FIXED ++ bpl 5b ++55: ++ add TMP1, mem_operand, TMP1, lsl #2 ++ asr TMP2, VX, #16 ++ adds VX, VX, UNIT_X ++ bmi 55f ++5: subs VX, VX, SRC_WIDTH_FIXED ++ bpl 5b ++55: ++ add TMP2, mem_operand, TMP2, lsl #2 ++ ld1 {v®1&.s}[0], [TMP1] ++ ld1 {v®1&.s}[1], [TMP2] ++.else ++ .error "unsupported" ++.endif ++.endm ++ ++.macro pixld2_s elem_size, reg1, reg2, mem_operand ++.if 0 /* elem_size == 32 */ ++ mov TMP1, VX, asr #16 ++ add VX, VX, UNIT_X, asl #1 ++ add TMP1, mem_operand, TMP1, asl #2 ++ mov TMP2, VX, asr #16 ++ sub VX, VX, UNIT_X ++ add TMP2, mem_operand, TMP2, asl #2 ++ ld1 {v®1&.s}[0], [TMP1] ++ mov TMP1, VX, asr #16 ++ add VX, VX, UNIT_X, asl #1 ++ add TMP1, mem_operand, TMP1, asl #2 ++ ld1 {v®2&.s}[0], [TMP2, :32] ++ mov TMP2, VX, asr #16 ++ add VX, VX, UNIT_X ++ add TMP2, mem_operand, TMP2, asl #2 ++ ld1 {v®1&.s}[1], [TMP1] ++ ld1 {v®2&.s}[1], [TMP2] ++.else ++ pixld1_s elem_size, reg1, mem_operand ++ pixld1_s elem_size, reg2, mem_operand ++.endif ++.endm ++ ++.macro pixld0_s elem_size, reg1, idx, mem_operand ++.if elem_size == 16 ++ asr TMP1, VX, #16 ++ adds VX, VX, UNIT_X ++ bmi 55f ++5: subs VX, VX, SRC_WIDTH_FIXED ++ bpl 5b ++55: ++ add TMP1, mem_operand, TMP1, lsl #1 ++ ld1 {v®1&.h}[idx], [TMP1] ++.elseif elem_size == 32 ++ asr DUMMY, VX, #16 ++ mov TMP1, DUMMY ++ adds VX, VX, UNIT_X ++ bmi 55f ++5: subs VX, VX, SRC_WIDTH_FIXED ++ bpl 5b ++55: ++ add TMP1, mem_operand, TMP1, lsl #2 ++ ld1 {v®1&.s}[idx], [TMP1] ++.endif ++.endm ++ ++.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand ++.if numbytes == 32 ++ pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand ++ pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand ++ pixdeinterleave elem_size, %(basereg+4) ++.elseif numbytes == 16 ++ pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand ++.elseif numbytes == 8 ++ pixld1_s elem_size, %(basereg+1), mem_operand ++.elseif numbytes == 4 ++ .if elem_size == 32 ++ pixld0_s elem_size, %(basereg+0), 1, mem_operand ++ .elseif elem_size == 16 ++ pixld0_s elem_size, %(basereg+0), 2, mem_operand ++ pixld0_s elem_size, %(basereg+0), 3, mem_operand ++ .else ++ pixld0_s elem_size, %(basereg+0), 4, mem_operand ++ pixld0_s elem_size, %(basereg+0), 5, mem_operand ++ pixld0_s elem_size, %(basereg+0), 6, mem_operand ++ pixld0_s elem_size, %(basereg+0), 7, mem_operand ++ .endif ++.elseif numbytes == 2 ++ .if elem_size == 16 ++ pixld0_s elem_size, %(basereg+0), 1, mem_operand ++ .else ++ pixld0_s elem_size, %(basereg+0), 2, mem_operand ++ pixld0_s elem_size, %(basereg+0), 3, mem_operand ++ .endif ++.elseif numbytes == 1 ++ pixld0_s elem_size, %(basereg+0), 1, mem_operand ++.else ++ .error "unsupported size: numbytes" ++.endif ++.endm ++ ++.macro pixld_s numpix, bpp, basereg, mem_operand ++.if bpp > 0 ++ pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand ++.endif ++.endm ++ ++.macro vuzp8 reg1, reg2 ++ umov DUMMY, v16.d[0] ++ uzp1 v16.8b, v®1&.8b, v®2&.8b ++ uzp2 v®2&.8b, v®1&.8b, v®2&.8b ++ mov v®1&.8b, v16.8b ++ mov v16.d[0], DUMMY ++.endm ++ ++.macro vzip8 reg1, reg2 ++ umov DUMMY, v16.d[0] ++ zip1 v16.8b, v®1&.8b, v®2&.8b ++ zip2 v®2&.8b, v®1&.8b, v®2&.8b ++ mov v®1&.8b, v16.8b ++ mov v16.d[0], DUMMY ++.endm ++ ++/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */ ++.macro pixdeinterleave bpp, basereg ++.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) ++ vuzp8 %(basereg+0), %(basereg+1) ++ vuzp8 %(basereg+2), %(basereg+3) ++ 
vuzp8 %(basereg+1), %(basereg+3) ++ vuzp8 %(basereg+0), %(basereg+2) ++.endif ++.endm ++ ++/* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */ ++.macro pixinterleave bpp, basereg ++.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) ++ vzip8 %(basereg+0), %(basereg+2) ++ vzip8 %(basereg+1), %(basereg+3) ++ vzip8 %(basereg+2), %(basereg+3) ++ vzip8 %(basereg+0), %(basereg+1) ++.endif ++.endm ++ ++/* ++ * This is a macro for implementing cache preload. The main idea is that ++ * cache preload logic is mostly independent from the rest of pixels ++ * processing code. It starts at the top left pixel and moves forward ++ * across pixels and can jump across scanlines. Prefetch distance is ++ * handled in an 'incremental' way: it starts from 0 and advances to the ++ * optimal distance over time. After reaching optimal prefetch distance, ++ * it is kept constant. There are some checks which prevent prefetching ++ * unneeded pixel lines below the image (but it still can prefetch a bit ++ * more data on the right side of the image - not a big issue and may ++ * be actually helpful when rendering text glyphs). Additional trick is ++ * the use of LDR instruction for prefetch instead of PLD when moving to ++ * the next line, the point is that we have a high chance of getting TLB ++ * miss in this case, and PLD would be useless. ++ * ++ * This sounds like it may introduce a noticeable overhead (when working with ++ * fully cached data). But in reality, due to having a separate pipeline and ++ * instruction queue for NEON unit in ARM Cortex-A8, normal ARM code can ++ * execute simultaneously with NEON and be completely shadowed by it. Thus ++ * we get no performance overhead at all (*). This looks like a very nice ++ * feature of Cortex-A8, if used wisely. We don't have a hardware prefetcher, ++ * but still can implement some rather advanced prefetch logic in software ++ * for almost zero cost! ++ * ++ * (*) The overhead of the prefetcher is visible when running some trivial ++ * pixels processing like simple copy. Anyway, having prefetch is a must ++ * when working with the graphics data. 
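++ *
++ * Editor's sketch of the pacing done by 'cache_preload' below
++ * (illustrative only, the variable names are ad hoc): per processed
++ * pixel block,
++ *
++ *     pf_x += std_increment;
++ *     if (pf_ctl & 0xf) {            // still ramping up
++ *         pf_x += boost_increment;
++ *         pf_ctl--;                  // low nibble: ramp-up counter
++ *     }
++ *     prefetch(pf_src + (pf_x << src_bpp_shift));
++ *     if (pf_x > orig_w) {           // ran past the current scanline
++ *         pf_x -= orig_w;
++ *         pf_ctl -= 0x10;            // high bits: scanlines remaining
++ *         touch(pf_src + line_stride_bytes);  // plain load, not PLD,
++ *         pf_src++;                           // to force the TLB walk
++ *     }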
++ */ ++.macro PF a, x:vararg ++.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED) ++ a x ++.endif ++.endm ++ ++.macro cache_preload std_increment, boost_increment ++.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0) ++.if std_increment != 0 ++ PF add PF_X, PF_X, #std_increment ++.endif ++ PF tst PF_CTL, #0xF ++ PF beq 71f ++ PF add PF_X, PF_X, #boost_increment ++ PF sub PF_CTL, PF_CTL, #1 ++71: ++ PF cmp PF_X, ORIG_W ++.if src_bpp_shift >= 0 ++ PF lsl DUMMY, PF_X, #src_bpp_shift ++ PF prfm PREFETCH_MODE, [PF_SRC, DUMMY] ++.endif ++.if dst_r_bpp != 0 ++ PF lsl DUMMY, PF_X, #dst_bpp_shift ++ PF prfm PREFETCH_MODE, [PF_DST, DUMMY] ++.endif ++.if mask_bpp_shift >= 0 ++ PF lsl DUMMY, PF_X, #mask_bpp_shift ++ PF prfm PREFETCH_MODE, [PF_MASK, DUMMY] ++.endif ++ PF ble 71f ++ PF sub PF_X, PF_X, ORIG_W ++ PF subs PF_CTL, PF_CTL, #0x10 ++71: ++ PF ble 72f ++.if src_bpp_shift >= 0 ++ PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift ++ PF ldrsb DUMMY, [PF_SRC, DUMMY] ++ PF add PF_SRC, PF_SRC, #1 ++.endif ++.if dst_r_bpp != 0 ++ PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift ++ PF ldrsb DUMMY, [PF_DST, DUMMY] ++ PF add PF_DST, PF_DST, #1 ++.endif ++.if mask_bpp_shift >= 0 ++ PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift ++ PF ldrsb DUMMY, [PF_MASK, DUMMY] ++ PF add PF_MASK, PF_MASK, #1 ++.endif ++72: ++.endif ++.endm ++ ++.macro cache_preload_simple ++.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE) ++.if src_bpp > 0 ++ prfm PREFETCH_MODE, [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)] ++.endif ++.if dst_r_bpp > 0 ++ prfm PREFETCH_MODE, [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)] ++.endif ++.if mask_bpp > 0 ++ prfm PREFETCH_MODE, [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)] ++.endif ++.endif ++.endm ++ ++.macro fetch_mask_pixblock ++ pixld pixblock_size, mask_bpp, \ ++ (mask_basereg - pixblock_size * mask_bpp / 64), MASK ++.endm ++ ++/* ++ * Macro which is used to process leading pixels until destination ++ * pointer is properly aligned (at 16 bytes boundary). When destination ++ * buffer uses 16bpp format, this is unnecessary, or even pointless. 
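++ *
++ * (Editor's illustration, assuming a 32bpp destination and
++ * pixblock_size == 8: a pointer ending in 0x4 first gets 1 pixel
++ * processed to reach 8-byte alignment, then 2 more pixels to reach
++ * 16 bytes, matching the unrolled lowbit == 1, 2, 4, 8 checks below.)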
++ */ ++.macro ensure_destination_ptr_alignment process_pixblock_head, \ ++ process_pixblock_tail, \ ++ process_pixblock_tail_head ++.if dst_w_bpp != 24 ++ tst DST_R, #0xF ++ beq 52f ++ ++.if src_bpp > 0 || mask_bpp > 0 || dst_r_bpp > 0 ++.irp lowbit, 1, 2, 4, 8, 16 ++local skip1 ++.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) ++.if lowbit < 16 /* we don't need more than 16-byte alignment */ ++ tst DST_R, #lowbit ++ beq 51f ++.endif ++ pixld_src (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC ++ pixld (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK ++.if dst_r_bpp > 0 ++ pixld_a (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R ++.else ++ add DST_R, DST_R, #lowbit ++.endif ++ PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp) ++ sub W, W, #(lowbit * 8 / dst_w_bpp) ++51: ++.endif ++.endr ++.endif ++ pixdeinterleave src_bpp, src_basereg ++ pixdeinterleave mask_bpp, mask_basereg ++ pixdeinterleave dst_r_bpp, dst_r_basereg ++ ++ process_pixblock_head ++ cache_preload 0, pixblock_size ++ cache_preload_simple ++ process_pixblock_tail ++ ++ pixinterleave dst_w_bpp, dst_w_basereg ++ ++.irp lowbit, 1, 2, 4, 8, 16 ++.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) ++.if lowbit < 16 /* we don't need more than 16-byte alignment */ ++ tst DST_W, #lowbit ++ beq 51f ++.endif ++.if src_bpp == 0 && mask_bpp == 0 && dst_r_bpp == 0 ++ sub W, W, #(lowbit * 8 / dst_w_bpp) ++.endif ++ pixst_a (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W ++51: ++.endif ++.endr ++.endif ++52: ++.endm ++ ++/* ++ * Special code for processing up to (pixblock_size - 1) remaining ++ * trailing pixels. As SIMD processing performs operation on ++ * pixblock_size pixels, anything smaller than this has to be loaded ++ * and stored in a special way. Loading and storing of pixel data is ++ * performed in such a way that we fill some 'slots' in the NEON ++ * registers (some slots naturally are unused), then perform compositing ++ * operation as usual. In the end, the data is taken from these 'slots' ++ * and saved to memory. 
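++ *
++ * (Editor's illustration: with pixblock_size == 8 and W == 5 pixels
++ * left, the chunk_size == 16, 8, 4, 2, 1 tests below fire for the bits
++ * set in 5, i.e. one 4-pixel and one 1-pixel partial load/store.)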
++ * ++ * cache_preload_flag - allows to suppress prefetch if ++ * set to 0 ++ * dst_aligned_flag - selects whether destination buffer ++ * is aligned ++ */ ++.macro process_trailing_pixels cache_preload_flag, \ ++ dst_aligned_flag, \ ++ process_pixblock_head, \ ++ process_pixblock_tail, \ ++ process_pixblock_tail_head ++ tst W, #(pixblock_size - 1) ++ beq 52f ++.if src_bpp > 0 || mask_bpp > 0 || dst_r_bpp > 0 ++.irp chunk_size, 16, 8, 4, 2, 1 ++.if pixblock_size > chunk_size ++ tst W, #chunk_size ++ beq 51f ++ pixld_src chunk_size, src_bpp, src_basereg, SRC ++ pixld chunk_size, mask_bpp, mask_basereg, MASK ++.if dst_aligned_flag != 0 ++ pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R ++.else ++ pixld chunk_size, dst_r_bpp, dst_r_basereg, DST_R ++.endif ++.if cache_preload_flag != 0 ++ PF add PF_X, PF_X, #chunk_size ++.endif ++51: ++.endif ++.endr ++.endif ++ pixdeinterleave src_bpp, src_basereg ++ pixdeinterleave mask_bpp, mask_basereg ++ pixdeinterleave dst_r_bpp, dst_r_basereg ++ ++ process_pixblock_head ++.if cache_preload_flag != 0 ++ cache_preload 0, pixblock_size ++ cache_preload_simple ++.endif ++ process_pixblock_tail ++ pixinterleave dst_w_bpp, dst_w_basereg ++.irp chunk_size, 16, 8, 4, 2, 1 ++.if pixblock_size > chunk_size ++ tst W, #chunk_size ++ beq 51f ++.if dst_aligned_flag != 0 ++ pixst_a chunk_size, dst_w_bpp, dst_w_basereg, DST_W ++.else ++ pixst chunk_size, dst_w_bpp, dst_w_basereg, DST_W ++.endif ++51: ++.endif ++.endr ++52: ++.endm ++ ++/* ++ * Macro, which performs all the needed operations to switch to the next ++ * scanline and start the next loop iteration unless all the scanlines ++ * are already processed. ++ */ ++.macro advance_to_next_scanline start_of_loop_label ++ mov W, ORIG_W ++ add DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift ++.if src_bpp != 0 ++ add SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift ++.endif ++.if mask_bpp != 0 ++ add MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift ++.endif ++.if (dst_w_bpp != 24) ++ sub DST_W, DST_W, W, lsl #dst_bpp_shift ++.endif ++.if (src_bpp != 24) && (src_bpp != 0) ++ sub SRC, SRC, W, lsl #src_bpp_shift ++.endif ++.if (mask_bpp != 24) && (mask_bpp != 0) ++ sub MASK, MASK, W, lsl #mask_bpp_shift ++.endif ++ subs H, H, #1 ++ mov DST_R, DST_W ++ bge start_of_loop_label ++.endm ++ ++/* ++ * Registers are allocated in the following way by default: ++ * v0, v1, v2, v3 - reserved for loading source pixel data ++ * v4, v5, v6, v7 - reserved for loading destination pixel data ++ * v24, v25, v26, v27 - reserved for loading mask pixel data ++ * v28, v29, v30, v31 - final destination pixel data for writeback to memory ++ */ ++.macro generate_composite_function fname, \ ++ src_bpp_, \ ++ mask_bpp_, \ ++ dst_w_bpp_, \ ++ flags, \ ++ pixblock_size_, \ ++ prefetch_distance, \ ++ init, \ ++ cleanup, \ ++ process_pixblock_head, \ ++ process_pixblock_tail, \ ++ process_pixblock_tail_head, \ ++ dst_w_basereg_ = 28, \ ++ dst_r_basereg_ = 4, \ ++ src_basereg_ = 0, \ ++ mask_basereg_ = 24 ++ ++ pixman_asm_function fname ++ stp x29, x30, [sp, -16]! 
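++ /*
++ * Editor's note on the frame built below (illustrative): after
++ * 'mov x29, sp', the two post-incremented st1 stores place v8-v15 at
++ * [x29 - 64 .. x29 - 1], and the stp/str stores place x8-x28 at
++ * [x29 - 232 .. x29 - 80]; the epilogues mirror this layout exactly.
++ */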
++ mov x29, sp
++ sub sp, sp, 232 /* push all registers */
++ sub x29, x29, 64
++ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32
++ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32
++ stp x8, x9, [x29, -80]
++ stp x10, x11, [x29, -96]
++ stp x12, x13, [x29, -112]
++ stp x14, x15, [x29, -128]
++ stp x16, x17, [x29, -144]
++ stp x18, x19, [x29, -160]
++ stp x20, x21, [x29, -176]
++ stp x22, x23, [x29, -192]
++ stp x24, x25, [x29, -208]
++ stp x26, x27, [x29, -224]
++ str x28, [x29, -232]
++
++/*
++ * Select prefetch type for this function. If prefetch distance is
++ * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch
++ * has to be used instead of ADVANCED.
++ */
++ .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
++.if prefetch_distance == 0
++ .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
++.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
++ ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
++ .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
++.endif
++
++/*
++ * Make some macro arguments globally visible and accessible
++ * from other macros
++ */
++ .set src_bpp, src_bpp_
++ .set mask_bpp, mask_bpp_
++ .set dst_w_bpp, dst_w_bpp_
++ .set pixblock_size, pixblock_size_
++ .set dst_w_basereg, dst_w_basereg_
++ .set dst_r_basereg, dst_r_basereg_
++ .set src_basereg, src_basereg_
++ .set mask_basereg, mask_basereg_
++
++ .macro pixld_src x:vararg
++ pixld x
++ .endm
++ .macro fetch_src_pixblock
++ pixld_src pixblock_size, src_bpp, \
++ (src_basereg - pixblock_size * src_bpp / 64), SRC
++ .endm
++/*
++ * Assign symbolic names to registers
++ */
++ W .req x0 /* width (is updated during processing) */
++ H .req x1 /* height (is updated during processing) */
++ DST_W .req x2 /* destination buffer pointer for writes */
++ DST_STRIDE .req x3 /* destination image stride */
++ SRC .req x4 /* source buffer pointer */
++ SRC_STRIDE .req x5 /* source image stride */
++ MASK .req x6 /* mask pointer */
++ MASK_STRIDE .req x7 /* mask stride */
++
++ DST_R .req x8 /* destination buffer pointer for reads */
++
++ PF_CTL .req x9 /* combined lines counter and prefetch */
++ /* distance increment counter */
++ PF_X .req x10 /* pixel index in a scanline for current */
++ /* prefetch position */
++ PF_SRC .req x11 /* pointer to source scanline start */
++ /* for prefetch purposes */
++ PF_DST .req x12 /* pointer to destination scanline start */
++ /* for prefetch purposes */
++ PF_MASK .req x13 /* pointer to mask scanline start */
++ /* for prefetch purposes */
++
++ ORIG_W .req x14 /* saved original width */
++ DUMMY .req x15 /* temporary register */
++
++ sxtw x0, w0
++ sxtw x1, w1
++ sxtw x3, w3
++ sxtw x5, w5
++ sxtw x7, w7
++
++ .set mask_bpp_shift, -1
++.if src_bpp == 32
++ .set src_bpp_shift, 2
++.elseif src_bpp == 24
++ .set src_bpp_shift, 0
++.elseif src_bpp == 16
++ .set src_bpp_shift, 1
++.elseif src_bpp == 8
++ .set src_bpp_shift, 0
++.elseif src_bpp == 0
++ .set src_bpp_shift, -1
++.else
++ .error "requested src bpp (src_bpp) is not supported"
++.endif
++.if mask_bpp == 32
++ .set mask_bpp_shift, 2
++.elseif mask_bpp == 24
++ .set mask_bpp_shift, 0
++.elseif mask_bpp == 8
++ .set mask_bpp_shift, 0
++.elseif mask_bpp == 0
++ .set mask_bpp_shift, -1
++.else
++ .error "requested mask bpp (mask_bpp) is not supported"
++.endif
++.if dst_w_bpp == 32
++ .set dst_bpp_shift, 2
++.elseif dst_w_bpp == 24
++ .set dst_bpp_shift, 0
++.elseif dst_w_bpp == 16
++ .set dst_bpp_shift, 1
++.elseif dst_w_bpp == 8
++ .set dst_bpp_shift, 0
++.else
++ .error "requested dst bpp (dst_w_bpp) is not supported"
++.endif
++
++.if (((flags) & FLAG_DST_READWRITE) != 0)
++ .set dst_r_bpp, dst_w_bpp
++.else
++ .set dst_r_bpp, 0
++.endif
++.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
++ .set DEINTERLEAVE_32BPP_ENABLED, 1
++.else
++ .set DEINTERLEAVE_32BPP_ENABLED, 0
++.endif
++
++.if prefetch_distance < 0 || prefetch_distance > 15
++ .error "invalid prefetch distance (prefetch_distance)"
++.endif
++
++ PF mov PF_X, #0
++ mov DST_R, DST_W
++
++.if src_bpp == 24
++ sub SRC_STRIDE, SRC_STRIDE, W
++ sub SRC_STRIDE, SRC_STRIDE, W, lsl #1
++.endif
++.if mask_bpp == 24
++ sub MASK_STRIDE, MASK_STRIDE, W
++ sub MASK_STRIDE, MASK_STRIDE, W, lsl #1
++.endif
++.if dst_w_bpp == 24
++ sub DST_STRIDE, DST_STRIDE, W
++ sub DST_STRIDE, DST_STRIDE, W, lsl #1
++.endif
++
++/*
++ * Setup advanced prefetcher initial state
++ */
++ PF mov PF_SRC, SRC
++ PF mov PF_DST, DST_R
++ PF mov PF_MASK, MASK
++ /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
++ PF lsl DUMMY, H, #4
++ PF mov PF_CTL, DUMMY
++ PF add PF_CTL, PF_CTL, #(prefetch_distance - 0x10)
++
++ init
++ subs H, H, #1
++ mov ORIG_W, W
++ blt 9f
++ cmp W, #(pixblock_size * 2)
++ blt 800f
++/*
++ * This is the start of the pipelined loop, which is optimized for
++ * long scanlines
++ */
++0:
++ ensure_destination_ptr_alignment process_pixblock_head, \
++ process_pixblock_tail, \
++ process_pixblock_tail_head
++
++ /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
++ pixld_a pixblock_size, dst_r_bpp, \
++ (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
++ fetch_src_pixblock
++ pixld pixblock_size, mask_bpp, \
++ (mask_basereg - pixblock_size * mask_bpp / 64), MASK
++ PF add PF_X, PF_X, #pixblock_size
++ process_pixblock_head
++ cache_preload 0, pixblock_size
++ cache_preload_simple
++ subs W, W, #(pixblock_size * 2)
++ blt 200f
++
++100:
++ process_pixblock_tail_head
++ cache_preload_simple
++ subs W, W, #pixblock_size
++ bge 100b
++
++200:
++ process_pixblock_tail
++ pixst_a pixblock_size, dst_w_bpp, \
++ (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
++
++ /* Process the remaining trailing pixels in the scanline */
++ process_trailing_pixels 1, 1, \
++ process_pixblock_head, \
++ process_pixblock_tail, \
++ process_pixblock_tail_head
++ advance_to_next_scanline 0b
++
++ cleanup
++1000:
++ /* pop all registers */
++ sub x29, x29, 64
++ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
++ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
++ ldp x8, x9, [x29, -80]
++ ldp x10, x11, [x29, -96]
++ ldp x12, x13, [x29, -112]
++ ldp x14, x15, [x29, -128]
++ ldp x16, x17, [x29, -144]
++ ldp x18, x19, [x29, -160]
++ ldp x20, x21, [x29, -176]
++ ldp x22, x23, [x29, -192]
++ ldp x24, x25, [x29, -208]
++ ldp x26, x27, [x29, -224]
++ ldr x28, [x29, -232]
++ mov sp, x29
++ ldp x29, x30, [sp], 16
++ ret /* exit */
++/*
++ * This is the start of the loop, designed to process images with small width
++ * (less than pixblock_size * 2 pixels). In this case neither pipelining
++ * nor prefetch are used.
++ */ ++800: ++.if src_bpp_shift >= 0 ++ PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift ++ PF prfm PREFETCH_MODE, [SRC, DUMMY] ++.endif ++.if dst_r_bpp != 0 ++ PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift ++ PF prfm PREFETCH_MODE, [DST_R, DUMMY] ++.endif ++.if mask_bpp_shift >= 0 ++ PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift ++ PF prfm PREFETCH_MODE, [MASK, DUMMY] ++.endif ++ /* Process exactly pixblock_size pixels if needed */ ++ tst W, #pixblock_size ++ beq 100f ++ pixld pixblock_size, dst_r_bpp, \ ++ (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R ++ fetch_src_pixblock ++ pixld pixblock_size, mask_bpp, \ ++ (mask_basereg - pixblock_size * mask_bpp / 64), MASK ++ process_pixblock_head ++ process_pixblock_tail ++ pixst pixblock_size, dst_w_bpp, \ ++ (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W ++100: ++ /* Process the remaining trailing pixels in the scanline */ ++ process_trailing_pixels 0, 0, \ ++ process_pixblock_head, \ ++ process_pixblock_tail, \ ++ process_pixblock_tail_head ++ advance_to_next_scanline 800b ++9: ++ cleanup ++ /* pop all registers */ ++ sub x29, x29, 64 ++ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32 ++ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32 ++ ldp x8, x9, [x29, -80] ++ ldp x10, x11, [x29, -96] ++ ldp x12, x13, [x29, -112] ++ ldp x14, x15, [x29, -128] ++ ldp x16, x17, [x29, -144] ++ ldp x18, x19, [x29, -160] ++ ldp x20, x21, [x29, -176] ++ ldp x22, x23, [x29, -192] ++ ldp x24, x25, [x29, -208] ++ ldp x26, x27, [x29, -224] ++ ldr x28, [x29, -232] ++ mov sp, x29 ++ ldp x29, x30, [sp], 16 ++ ret /* exit */ ++ ++ .purgem fetch_src_pixblock ++ .purgem pixld_src ++ ++ .unreq SRC ++ .unreq MASK ++ .unreq DST_R ++ .unreq DST_W ++ .unreq ORIG_W ++ .unreq W ++ .unreq H ++ .unreq SRC_STRIDE ++ .unreq DST_STRIDE ++ .unreq MASK_STRIDE ++ .unreq PF_CTL ++ .unreq PF_X ++ .unreq PF_SRC ++ .unreq PF_DST ++ .unreq PF_MASK ++ .unreq DUMMY ++ .endfunc ++.endm ++ ++/* ++ * A simplified variant of function generation template for a single ++ * scanline processing (for implementing pixman combine functions) ++ */ ++.macro generate_composite_function_scanline use_nearest_scaling, \ ++ fname, \ ++ src_bpp_, \ ++ mask_bpp_, \ ++ dst_w_bpp_, \ ++ flags, \ ++ pixblock_size_, \ ++ init, \ ++ cleanup, \ ++ process_pixblock_head, \ ++ process_pixblock_tail, \ ++ process_pixblock_tail_head, \ ++ dst_w_basereg_ = 28, \ ++ dst_r_basereg_ = 4, \ ++ src_basereg_ = 0, \ ++ mask_basereg_ = 24 ++ ++ pixman_asm_function fname ++ .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE ++ ++/* ++ * Make some macro arguments globally visible and accessible ++ * from other macros ++ */ ++ .set src_bpp, src_bpp_ ++ .set mask_bpp, mask_bpp_ ++ .set dst_w_bpp, dst_w_bpp_ ++ .set pixblock_size, pixblock_size_ ++ .set dst_w_basereg, dst_w_basereg_ ++ .set dst_r_basereg, dst_r_basereg_ ++ .set src_basereg, src_basereg_ ++ .set mask_basereg, mask_basereg_ ++ ++.if use_nearest_scaling != 0 ++ /* ++ * Assign symbolic names to registers for nearest scaling ++ */ ++ W .req x0 ++ DST_W .req x1 ++ SRC .req x2 ++ VX .req x3 ++ UNIT_X .req x4 ++ SRC_WIDTH_FIXED .req x5 ++ MASK .req x6 ++ TMP1 .req x8 ++ TMP2 .req x9 ++ DST_R .req x10 ++ DUMMY .req x30 ++ ++ .macro pixld_src x:vararg ++ pixld_s x ++ .endm ++ ++ sxtw x0, w0 ++ sxtw x3, w3 ++ sxtw x4, w4 ++ sxtw x5, w5 ++ ++ stp x29, x30, [sp, -16]! 
++ mov x29, sp
++ sub sp, sp, 88
++ sub x29, x29, 64
++ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
++ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
++ stp x8, x9, [x29, -80]
++ str x10, [x29, -88]
++.else
++ /*
++ * Assign symbolic names to registers
++ */
++ W .req x0 /* width (is updated during processing) */
++ DST_W .req x1 /* destination buffer pointer for writes */
++ SRC .req x2 /* source buffer pointer */
++ MASK .req x3 /* mask pointer */
++ DST_R .req x4 /* destination buffer pointer for reads */
++ DUMMY .req x30
++
++ .macro pixld_src x:vararg
++ pixld x
++ .endm
++
++ sxtw x0, w0
++
++ stp x29, x30, [sp, -16]!
++ mov x29, sp
++ sub sp, sp, 64
++ sub x29, x29, 64
++ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
++ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
++.endif
++
++.if (((flags) & FLAG_DST_READWRITE) != 0)
++ .set dst_r_bpp, dst_w_bpp
++.else
++ .set dst_r_bpp, 0
++.endif
++.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
++ .set DEINTERLEAVE_32BPP_ENABLED, 1
++.else
++ .set DEINTERLEAVE_32BPP_ENABLED, 0
++.endif
++
++ .macro fetch_src_pixblock
++ pixld_src pixblock_size, src_bpp, \
++ (src_basereg - pixblock_size * src_bpp / 64), SRC
++ .endm
++
++ init
++ mov DST_R, DST_W
++
++ cmp W, #pixblock_size
++ blt 800f
++
++ ensure_destination_ptr_alignment process_pixblock_head, \
++ process_pixblock_tail, \
++ process_pixblock_tail_head
++
++ subs W, W, #pixblock_size
++ blt 700f
++
++ /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
++ pixld_a pixblock_size, dst_r_bpp, \
++ (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
++ fetch_src_pixblock
++ pixld pixblock_size, mask_bpp, \
++ (mask_basereg - pixblock_size * mask_bpp / 64), MASK
++ process_pixblock_head
++ subs W, W, #pixblock_size
++ blt 200f
++100:
++ process_pixblock_tail_head
++ subs W, W, #pixblock_size
++ bge 100b
++200:
++ process_pixblock_tail
++ pixst_a pixblock_size, dst_w_bpp, \
++ (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
++700:
++ /* Process the remaining trailing pixels in the scanline (dst aligned) */
++ process_trailing_pixels 0, 1, \
++ process_pixblock_head, \
++ process_pixblock_tail, \
++ process_pixblock_tail_head
++
++ cleanup
++.if use_nearest_scaling != 0
++ sub x29, x29, 64
++ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
++ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
++ ldp x8, x9, [x29, -80]
++ ldr x10, [x29, -88]
++ mov sp, x29
++ ldp x29, x30, [sp], 16
++ ret /* exit */
++.else
++ sub x29, x29, 64
++ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
++ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
++ mov sp, x29
++ ldp x29, x30, [sp], 16
++ ret /* exit */
++.endif
++800:
++ /* Process the remaining trailing pixels in the scanline (dst unaligned) */
++ process_trailing_pixels 0, 0, \
++ process_pixblock_head, \
++ process_pixblock_tail, \
++ process_pixblock_tail_head
++
++ cleanup
++.if use_nearest_scaling != 0
++ sub x29, x29, 64
++ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
++ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
++ ldp x8, x9, [x29, -80]
++ ldr x10, [x29, -88]
++ mov sp, x29
++ ldp x29, x30, [sp], 16
++ ret /* exit */
++
++ .unreq DUMMY
++ .unreq DST_R
++ .unreq SRC
++ .unreq W
++ .unreq VX
++ .unreq UNIT_X
++ .unreq TMP1
++ .unreq TMP2
++ .unreq DST_W
++ .unreq MASK
++ .unreq SRC_WIDTH_FIXED
++
++.else
++ sub x29, x29, 64
++ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
++ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
++ mov sp, x29
++ ldp x29, x30, [sp], 16
++ ret /* exit */
++
++ .unreq DUMMY
++ .unreq SRC
++ .unreq MASK
++ .unreq DST_R
++ .unreq DST_W
++ .unreq W
++.endif
++
++ .purgem fetch_src_pixblock
++ .purgem pixld_src
++
++ .endfunc
++.endm
++
++.macro generate_composite_function_single_scanline x:vararg
++ generate_composite_function_scanline 0, x
++.endm
++
++.macro generate_composite_function_nearest_scanline x:vararg
++ generate_composite_function_scanline 1, x
++.endm
++
++/* Default prologue/epilogue, nothing special needs to be done */
++
++.macro default_init
++.endm
++
++.macro default_cleanup
++.endm
++
++/*
++ * Prologue/epilogue variant which additionally saves/restores v8-v15
++ * registers (they need to be saved/restored by callee according to ABI).
++ * This is required if the code needs to use all the NEON registers.
++ */
++
++.macro default_init_need_all_regs
++.endm
++
++.macro default_cleanup_need_all_regs
++.endm
++
++/******************************************************************************/
++
++/*
++ * Conversion of 8 r5g6b5 pixels packed in 128-bit register (in)
++ * into a planar a8r8g8b8 format (with a, r, g, b color components
++ * stored into 64-bit registers out_a, out_r, out_g, out_b respectively).
++ *
++ * Warning: the conversion is destructive and the original
++ * value (in) is lost.
++ */
++.macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
++ shrn &out_r&.8b, &in&.8h, #8
++ shrn &out_g&.8b, &in&.8h, #3
++ sli &in&.8h, &in&.8h, #5
++ movi &out_a&.8b, #255
++ sri &out_r&.8b, &out_r&.8b, #5
++ sri &out_g&.8b, &out_g&.8b, #6
++ shrn &out_b&.8b, &in&.8h, #2
++.endm
++
++.macro convert_0565_to_x888 in, out_r, out_g, out_b
++ shrn &out_r&.8b, &in&.8h, #8
++ shrn &out_g&.8b, &in&.8h, #3
++ sli &in&.8h, &in&.8h, #5
++ sri &out_r&.8b, &out_r&.8b, #5
++ sri &out_g&.8b, &out_g&.8b, #6
++ shrn &out_b&.8b, &in&.8h, #2
++.endm
++
++/*
++ * Conversion from planar a8r8g8b8 format (with a, r, g, b color components
++ * in 64-bit registers in_a, in_r, in_g, in_b respectively) into 8 r5g6b5
++ * pixels packed in 128-bit register (out). Requires two temporary 128-bit
++ * registers (tmp1, tmp2)
++ */
++.macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
++ ushll &tmp1&.8h, &in_g&.8b, #7
++ shl &tmp1&.8h, &tmp1&.8h, #1
++ ushll &out&.8h, &in_r&.8b, #7
++ shl &out&.8h, &out&.8h, #1
++ ushll &tmp2&.8h, &in_b&.8b, #7
++ shl &tmp2&.8h, &tmp2&.8h, #1
++ sri &out&.8h, &tmp1&.8h, #5
++ sri &out&.8h, &tmp2&.8h, #11
++.endm
++
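++/*
++ * Editor's note (illustrative, not part of the original file): for one
++ * pixel, the 0565 -> 8888 widening above is equivalent to the usual
++ * scalar bit-replication:
++ *
++ *     r5 = (p >> 11) & 0x1f;  g6 = (p >> 5) & 0x3f;  b5 = p & 0x1f;
++ *     r8 = (r5 << 3) | (r5 >> 2);
++ *     g8 = (g6 << 2) | (g6 >> 4);
++ *     b8 = (b5 << 3) | (b5 >> 2);
++ *
++ * which the shrn/sli/sri sequences perform on eight pixels at once.
++ */
++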
++/*
++ * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
++ * returned in (out0, out1) registers pair. Requires one temporary
++ * 64-bit register (tmp). 'out1' and 'in' may overlap, the original
++ * value from 'in' is lost
++ */
++.macro convert_four_0565_to_x888_packed in, out0, out1, tmp
++ shl &out0&.4h, &in&.4h, #5 /* G top 6 bits */
++ shl &tmp&.4h, &in&.4h, #11 /* B top 5 bits */
++ sri &in&.4h, &in&.4h, #5 /* R is ready in top bits */
++ sri &out0&.4h, &out0&.4h, #6 /* G is ready in top bits */
++ sri &tmp&.4h, &tmp&.4h, #5 /* B is ready in top bits */
++ ushr &out1&.4h, &in&.4h, #8 /* R is in place */
++ sri &out0&.4h, &tmp&.4h, #8 /* G & B is in place */
++ zip1 &tmp&.4h, &out0&.4h, &out1&.4h /* everything is in place */
++ zip2 &out1&.4h, &out0&.4h, &out1&.4h
++ mov &out0&.d[0], &tmp&.d[0]
++.endm
+diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
+index 73a5414..22c8ccc
+--- a/pixman/pixman-private.h
++++ b/pixman/pixman-private.h
+@@ -607,6 +607,11 @@ pixman_implementation_t *
+ _pixman_implementation_create_arm_neon (pixman_implementation_t *fallback);
+ #endif
+
++#ifdef USE_ARM_A64_NEON
++pixman_implementation_t *
++_pixman_implementation_create_arm_neon (pixman_implementation_t *fallback);
++#endif
++
+ #ifdef USE_MIPS_DSPR2
+ pixman_implementation_t *
+ _pixman_implementation_create_mips_dspr2 (pixman_implementation_t *fallback);
+--
+2.8.0
+
diff --git a/SDK/board/funkey/patches/sdl/0001-pixel-alpha-multiply.patch b/SDK/board/funkey/patches/sdl/0001-pixel-alpha-multiply.patch
new file mode 100644
index 0000000..29e6af0
--- /dev/null
+++ b/SDK/board/funkey/patches/sdl/0001-pixel-alpha-multiply.patch
@@ -0,0 +1,295 @@
+ SDL_blit_A.c | 270 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 270 insertions(+)
+
+diff --git a/src/video/SDL_blit_A.c b/src/video/SDL_blit_A.c
+index 219cdcc..d8e921e 100644
+--- a/src/video/SDL_blit_A.c
++++ b/src/video/SDL_blit_A.c
+@@ -27,3 +34,270 @@
++
++/*!
++\brief Alpha adjustment table for custom blitter.
++
++The table provides values for a modified, non-linear
++transfer function which maintains brightness.
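++
++(Editor's note, illustrative: the entries appear to follow a square-root
++curve, i.e. entry i is approximately 255 * sqrt(i / 255.0), so that e.g.
++64 maps to 127 and 128 maps to 180; a hypothetical generator would be
++'table[i] = (unsigned int)(255.0 * sqrt(i / 255.0))'.)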
++ ++*/ ++const unsigned int GFX_ALPHA_ADJUST_ARRAY[256] = { ++ 0, /* 0 */ ++ 15, /* 1 */ ++ 22, /* 2 */ ++ 27, /* 3 */ ++ 31, /* 4 */ ++ 35, /* 5 */ ++ 39, /* 6 */ ++ 42, /* 7 */ ++ 45, /* 8 */ ++ 47, /* 9 */ ++ 50, /* 10 */ ++ 52, /* 11 */ ++ 55, /* 12 */ ++ 57, /* 13 */ ++ 59, /* 14 */ ++ 61, /* 15 */ ++ 63, /* 16 */ ++ 65, /* 17 */ ++ 67, /* 18 */ ++ 69, /* 19 */ ++ 71, /* 20 */ ++ 73, /* 21 */ ++ 74, /* 22 */ ++ 76, /* 23 */ ++ 78, /* 24 */ ++ 79, /* 25 */ ++ 81, /* 26 */ ++ 82, /* 27 */ ++ 84, /* 28 */ ++ 85, /* 29 */ ++ 87, /* 30 */ ++ 88, /* 31 */ ++ 90, /* 32 */ ++ 91, /* 33 */ ++ 93, /* 34 */ ++ 94, /* 35 */ ++ 95, /* 36 */ ++ 97, /* 37 */ ++ 98, /* 38 */ ++ 99, /* 39 */ ++ 100, /* 40 */ ++ 102, /* 41 */ ++ 103, /* 42 */ ++ 104, /* 43 */ ++ 105, /* 44 */ ++ 107, /* 45 */ ++ 108, /* 46 */ ++ 109, /* 47 */ ++ 110, /* 48 */ ++ 111, /* 49 */ ++ 112, /* 50 */ ++ 114, /* 51 */ ++ 115, /* 52 */ ++ 116, /* 53 */ ++ 117, /* 54 */ ++ 118, /* 55 */ ++ 119, /* 56 */ ++ 120, /* 57 */ ++ 121, /* 58 */ ++ 122, /* 59 */ ++ 123, /* 60 */ ++ 124, /* 61 */ ++ 125, /* 62 */ ++ 126, /* 63 */ ++ 127, /* 64 */ ++ 128, /* 65 */ ++ 129, /* 66 */ ++ 130, /* 67 */ ++ 131, /* 68 */ ++ 132, /* 69 */ ++ 133, /* 70 */ ++ 134, /* 71 */ ++ 135, /* 72 */ ++ 136, /* 73 */ ++ 137, /* 74 */ ++ 138, /* 75 */ ++ 139, /* 76 */ ++ 140, /* 77 */ ++ 141, /* 78 */ ++ 141, /* 79 */ ++ 142, /* 80 */ ++ 143, /* 81 */ ++ 144, /* 82 */ ++ 145, /* 83 */ ++ 146, /* 84 */ ++ 147, /* 85 */ ++ 148, /* 86 */ ++ 148, /* 87 */ ++ 149, /* 88 */ ++ 150, /* 89 */ ++ 151, /* 90 */ ++ 152, /* 91 */ ++ 153, /* 92 */ ++ 153, /* 93 */ ++ 154, /* 94 */ ++ 155, /* 95 */ ++ 156, /* 96 */ ++ 157, /* 97 */ ++ 158, /* 98 */ ++ 158, /* 99 */ ++ 159, /* 100 */ ++ 160, /* 101 */ ++ 161, /* 102 */ ++ 162, /* 103 */ ++ 162, /* 104 */ ++ 163, /* 105 */ ++ 164, /* 106 */ ++ 165, /* 107 */ ++ 165, /* 108 */ ++ 166, /* 109 */ ++ 167, /* 110 */ ++ 168, /* 111 */ ++ 168, /* 112 */ ++ 169, /* 113 */ ++ 170, /* 114 */ ++ 171, /* 115 */ ++ 171, /* 116 */ ++ 172, /* 117 */ ++ 173, /* 118 */ ++ 174, /* 119 */ ++ 174, /* 120 */ ++ 175, /* 121 */ ++ 176, /* 122 */ ++ 177, /* 123 */ ++ 177, /* 124 */ ++ 178, /* 125 */ ++ 179, /* 126 */ ++ 179, /* 127 */ ++ 180, /* 128 */ ++ 181, /* 129 */ ++ 182, /* 130 */ ++ 182, /* 131 */ ++ 183, /* 132 */ ++ 184, /* 133 */ ++ 184, /* 134 */ ++ 185, /* 135 */ ++ 186, /* 136 */ ++ 186, /* 137 */ ++ 187, /* 138 */ ++ 188, /* 139 */ ++ 188, /* 140 */ ++ 189, /* 141 */ ++ 190, /* 142 */ ++ 190, /* 143 */ ++ 191, /* 144 */ ++ 192, /* 145 */ ++ 192, /* 146 */ ++ 193, /* 147 */ ++ 194, /* 148 */ ++ 194, /* 149 */ ++ 195, /* 150 */ ++ 196, /* 151 */ ++ 196, /* 152 */ ++ 197, /* 153 */ ++ 198, /* 154 */ ++ 198, /* 155 */ ++ 199, /* 156 */ ++ 200, /* 157 */ ++ 200, /* 158 */ ++ 201, /* 159 */ ++ 201, /* 160 */ ++ 202, /* 161 */ ++ 203, /* 162 */ ++ 203, /* 163 */ ++ 204, /* 164 */ ++ 205, /* 165 */ ++ 205, /* 166 */ ++ 206, /* 167 */ ++ 206, /* 168 */ ++ 207, /* 169 */ ++ 208, /* 170 */ ++ 208, /* 171 */ ++ 209, /* 172 */ ++ 210, /* 173 */ ++ 210, /* 174 */ ++ 211, /* 175 */ ++ 211, /* 176 */ ++ 212, /* 177 */ ++ 213, /* 178 */ ++ 213, /* 179 */ ++ 214, /* 180 */ ++ 214, /* 181 */ ++ 215, /* 182 */ ++ 216, /* 183 */ ++ 216, /* 184 */ ++ 217, /* 185 */ ++ 217, /* 186 */ ++ 218, /* 187 */ ++ 218, /* 188 */ ++ 219, /* 189 */ ++ 220, /* 190 */ ++ 220, /* 191 */ ++ 221, /* 192 */ ++ 221, /* 193 */ ++ 222, /* 194 */ ++ 222, /* 195 */ ++ 223, /* 196 */ ++ 224, /* 197 */ ++ 224, /* 198 */ ++ 225, /* 199 */ ++ 225, /* 200 */ ++ 226, /* 201 */ ++ 226, 
/* 202 */
++ 227, /* 203 */
++ 228, /* 204 */
++ 228, /* 205 */
++ 229, /* 206 */
++ 229, /* 207 */
++ 230, /* 208 */
++ 230, /* 209 */
++ 231, /* 210 */
++ 231, /* 211 */
++ 232, /* 212 */
++ 233, /* 213 */
++ 233, /* 214 */
++ 234, /* 215 */
++ 234, /* 216 */
++ 235, /* 217 */
++ 235, /* 218 */
++ 236, /* 219 */
++ 236, /* 220 */
++ 237, /* 221 */
++ 237, /* 222 */
++ 238, /* 223 */
++ 238, /* 224 */
++ 239, /* 225 */
++ 240, /* 226 */
++ 240, /* 227 */
++ 241, /* 228 */
++ 241, /* 229 */
++ 242, /* 230 */
++ 242, /* 231 */
++ 243, /* 232 */
++ 243, /* 233 */
++ 244, /* 234 */
++ 244, /* 235 */
++ 245, /* 236 */
++ 245, /* 237 */
++ 246, /* 238 */
++ 246, /* 239 */
++ 247, /* 240 */
++ 247, /* 241 */
++ 248, /* 242 */
++ 248, /* 243 */
++ 249, /* 244 */
++ 249, /* 245 */
++ 250, /* 246 */
++ 250, /* 247 */
++ 251, /* 248 */
++ 251, /* 249 */
++ 252, /* 250 */
++ 252, /* 251 */
++ 253, /* 252 */
++ 253, /* 253 */
++ 254, /* 254 */
++ 255 /* 255 */
++};
++
+ /*
+ In Visual C, VC6 has mmintrin.h in the "Processor Pack" add-on.
+ Checking if _mm_free is #defined in malloc.h is is the only way to
+@@ -2679,6 +2985,7 @@ static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
+ int dstskip = info->d_skip;
+ SDL_PixelFormat *srcfmt = info->src;
+ SDL_PixelFormat *dstfmt = info->dst;
++ uint8_t alpha_multiply = srcfmt->alpha;
+
+ int srcbpp;
+ int dstbpp;
+@@ -2705,6 +3012,8 @@ static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
+ unsigned sA;
+ unsigned dA;
+ DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
++ sA=(sA*alpha_multiply)>>8;
++ sA=GFX_ALPHA_ADJUST_ARRAY[sA & 255];
+ if(sA) {
+ DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
+ ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
diff --git a/SDK/board/funkey/patches/sdl/003-alsa-fix-excessiveio.patch b/SDK/board/funkey/patches/sdl/003-alsa-fix-excessiveio.patch
new file mode 100644
index 0000000..46e9524
--- /dev/null
+++ b/SDK/board/funkey/patches/sdl/003-alsa-fix-excessiveio.patch
@@ -0,0 +1,26 @@
+Merge this upstream fix, as the bug it addresses can affect performance:
+https://github.com/OpenDingux/SDL/commit/e51100dce8da9099278dac9f5affbecf6396378b
+
+--- a/src/audio/alsa/SDL_alsa_audio.c
++++ b/src/audio/alsa/SDL_alsa_audio.c
+@@ -479,6 +479,10 @@
+ return(-1);
+ }
+
++ /* Switch to blocking mode for playback */
++ /* Note: this must happen before hw/sw params are set. */
++ SDL_NAME(snd_pcm_nonblock)(pcm_handle, 0);
++
+ /* Figure out what the hardware is capable of */
+ snd_pcm_hw_params_alloca(&hwparams);
+ status = SDL_NAME(snd_pcm_hw_params_any)(pcm_handle, hwparams);
+@@ -611,9 +615,6 @@
+ }
+ SDL_memset(mixbuf, spec->silence, spec->size);
+
+- /* Switch to blocking mode for playback */
+- SDL_NAME(snd_pcm_nonblock)(pcm_handle, 0);
+-
+ /* We're ready to rock and roll.
:-) */ + return(0); + } diff --git a/SDK/board/funkey/patches/sdl/sdl-fix-kb-input.patch b/SDK/board/funkey/patches/sdl/sdl-fix-kb-input.patch new file mode 100644 index 0000000..8f7db83 --- /dev/null +++ b/SDK/board/funkey/patches/sdl/sdl-fix-kb-input.patch @@ -0,0 +1,22 @@ +diff --git a/src/video/fbcon/SDL_fbevents.c b/src/video/fbcon/SDL_fbevents.c +index 5e369a4..549a7ad 100644 +--- a/src/video/fbcon/SDL_fbevents.c ++++ b/src/video/fbcon/SDL_fbevents.c +@@ -270,17 +270,6 @@ int FB_OpenKeyboard(_THIS) + fprintf(stderr, "vtpath = %s, fd = %d\n", + vtpath, keyboard_fd); + #endif /* DEBUG_KEYBOARD */ +- +- /* This needs to be our controlling tty +- so that the kernel ioctl() calls work +- */ +- if ( keyboard_fd >= 0 ) { +- tty0_fd = open("/dev/tty", O_RDWR, 0); +- if ( tty0_fd >= 0 ) { +- ioctl(tty0_fd, TIOCNOTTY, 0); +- close(tty0_fd); +- } +- } + } + } + if ( keyboard_fd < 0 ) { diff --git a/SDK/configs/funkey_defconfig b/SDK/configs/funkey_defconfig new file mode 100644 index 0000000..56e64a0 --- /dev/null +++ b/SDK/configs/funkey_defconfig @@ -0,0 +1,82 @@ +BR2_arm=y +BR2_cortex_a7=y +BR2_ARM_FPU_VFPV4=y +BR2_DL_DIR="../download" +BR2_CCACHE=y +BR2_OPTIMIZE_FAST=y +BR2_SHARED_STATIC_LIBS=y +BR2_GLOBAL_PATCH_DIR="$(BR2_EXTERNAL_SDK_PATH)/board/funkey/patches" +BR2_TOOLCHAIN_BUILDROOT_VENDOR="funkey" +BR2_TOOLCHAIN_BUILDROOT_MUSL=y +BR2_KERNEL_HEADERS_VERSION=y +BR2_DEFAULT_KERNEL_VERSION="4.14.14" +BR2_PACKAGE_HOST_LINUX_HEADERS_CUSTOM_4_14=y +BR2_BINUTILS_VERSION_2_35_X=y +BR2_GCC_VERSION_10_X=y +BR2_TOOLCHAIN_BUILDROOT_CXX=y +BR2_GCC_ENABLE_LTO=y +BR2_PACKAGE_HOST_GDB=y +BR2_PACKAGE_HOST_GDB_TUI=y +BR2_GDB_VERSION_9_2=y +BR2_TARGET_OPTIMIZATION="-fno-PIC -march=armv7-a+neon-vfpv4 -mtune=cortex-a7 -mfpu=neon-vfpv4 -mvectorize-with-neon-quad" +BR2_INIT_NONE=y +BR2_PACKAGE_BUSYBOX_SHOW_OTHERS=y +BR2_PACKAGE_ALSA_UTILS=y +BR2_PACKAGE_GSTREAMER1=y +BR2_PACKAGE_GST1_PLUGINS_BASE=y +BR2_PACKAGE_MPG123=y +BR2_PACKAGE_LZOP=y +BR2_PACKAGE_E2FSPROGS=y +BR2_PACKAGE_E2FSPROGS_RESIZE2FS=y +BR2_PACKAGE_SDL_GFX=y +BR2_PACKAGE_SDL_IMAGE=y +BR2_PACKAGE_SDL_IMAGE_GIF=y +BR2_PACKAGE_SDL_IMAGE_JPEG=y +BR2_PACKAGE_SDL_IMAGE_PNG=y +BR2_PACKAGE_SDL_MIXER=y +BR2_PACKAGE_SDL_NET=y +BR2_PACKAGE_SDL_SOUND=y +BR2_PACKAGE_SDL_TTF=y +BR2_PACKAGE_PARTED=y +BR2_PACKAGE_UBOOT_TOOLS=y +BR2_PACKAGE_UBOOT_TOOLS_MKIMAGE=y +BR2_PACKAGE_UBOOT_TOOLS_MKENVIMAGE=y +BR2_PACKAGE_LUA=y +BR2_PACKAGE_LIBSAMPLERATE=y +BR2_PACKAGE_LIBSNDFILE=y +BR2_PACKAGE_OPENAL=y +BR2_PACKAGE_TINYALSA=y +BR2_PACKAGE_TREMOR=y +BR2_PACKAGE_LIBARCHIVE=y +BR2_PACKAGE_LIBARCHIVE_BSDTAR=y +BR2_PACKAGE_LIBARCHIVE_BSDCPIO=y +BR2_PACKAGE_LIBARCHIVE_BSDCAT=y +BR2_PACKAGE_SQLITE=y +BR2_PACKAGE_LIBCONFIG=y +BR2_PACKAGE_GIFLIB=y +BR2_PACKAGE_LIBQRENCODE=y +BR2_PACKAGE_LIBQRENCODE_TOOLS=y +BR2_PACKAGE_PIXMAN=y +BR2_PACKAGE_TINYXML2=y +BR2_PACKAGE_LIBNL=y +BR2_PACKAGE_LIBRSYNC=y +BR2_PACKAGE_FMT=y +BR2_PACKAGE_ICU=y +BR2_PACKAGE_ACL=y +BR2_PACKAGE_PROCPS_NG=y +# BR2_TARGET_ROOTFS_TAR is not set +BR2_PACKAGE_HOST_DOSFSTOOLS=y +BR2_PACKAGE_HOST_DTC=y +BR2_PACKAGE_HOST_E2FSPROGS=y +BR2_PACKAGE_HOST_ENVIRONMENT_SETUP=y +BR2_PACKAGE_HOST_GENIMAGE=y +BR2_PACKAGE_HOST_KMOD=y +BR2_PACKAGE_HOST_MKPASSWD=y +BR2_PACKAGE_HOST_MTOOLS=y +BR2_PACKAGE_HOST_SQUASHFS=y +BR2_PACKAGE_HOST_UBOOT_TOOLS=y +BR2_PACKAGE_LIBOPK=y +BR2_PACKAGE_LIBXDGMIME=y +BR2_PACKAGE_AGG=y +BR2_PACKAGE_FLUIDLITE=y +BR2_PACKAGE_LIBMIKMOD=y diff --git a/SDK/external.desc b/SDK/external.desc new file mode 100644 index 0000000..1623a12 --- /dev/null +++ b/SDK/external.desc @@ -0,0 +1,2 @@ +name: SDK +desc: 
FunKey SDK diff --git a/SDK/external.mk b/SDK/external.mk new file mode 100644 index 0000000..7433fb4 --- /dev/null +++ b/SDK/external.mk @@ -0,0 +1 @@ +include $(sort $(wildcard $(BR2_EXTERNAL_SDK_PATH)/package/*/*.mk)) diff --git a/SDK/package/agg/0001-Fix-non-terminating-loop-conditions-when-len-1.patch b/SDK/package/agg/0001-Fix-non-terminating-loop-conditions-when-len-1.patch new file mode 100644 index 0000000..eaf0467 --- /dev/null +++ b/SDK/package/agg/0001-Fix-non-terminating-loop-conditions-when-len-1.patch @@ -0,0 +1,81 @@ +From efd33aad5e69f36ab343b1f28839a55db4538104 Mon Sep 17 00:00:00 2001 +From: Tom Hughes +Date: Sun, 19 May 2013 10:55:37 +0100 +Subject: [PATCH 01/15] Fix non-terminating loop conditions when len=1 + +- while(abs(sx - lp.x1) + abs(sy - lp.y1) > lp2.len) ++ while(abs(sx - lp.x1) + abs(sy - lp.y1) > 1 + lp2.len) + { + sx = (lp.x1 + sx) >> 1; + sy = (lp.y1 + sy) >> 1; + } +--- + include/agg_renderer_outline_aa.h | 8 ++++---- + include/agg_renderer_outline_image.h | 4 ++-- + 2 files changed, 6 insertions(+), 6 deletions(-) + +diff --git a/include/agg_renderer_outline_aa.h b/include/agg_renderer_outline_aa.h +index ce25a2e..cb2aa00 100644 +--- a/include/agg_renderer_outline_aa.h ++++ b/include/agg_renderer_outline_aa.h +@@ -1659,7 +1659,7 @@ namespace agg + } + else + { +- while(abs(sx - lp.x1) + abs(sy - lp.y1) > lp2.len) ++ while(abs(sx - lp.x1) + abs(sy - lp.y1) > 1 + lp2.len) + { + sx = (lp.x1 + sx) >> 1; + sy = (lp.y1 + sy) >> 1; +@@ -1726,7 +1726,7 @@ namespace agg + } + else + { +- while(abs(ex - lp.x2) + abs(ey - lp.y2) > lp2.len) ++ while(abs(ex - lp.x2) + abs(ey - lp.y2) > 1 + lp2.len) + { + ex = (lp.x2 + ex) >> 1; + ey = (lp.y2 + ey) >> 1; +@@ -1798,7 +1798,7 @@ namespace agg + } + else + { +- while(abs(sx - lp.x1) + abs(sy - lp.y1) > lp2.len) ++ while(abs(sx - lp.x1) + abs(sy - lp.y1) > 1 + lp2.len) + { + sx = (lp.x1 + sx) >> 1; + sy = (lp.y1 + sy) >> 1; +@@ -1811,7 +1811,7 @@ namespace agg + } + else + { +- while(abs(ex - lp.x2) + abs(ey - lp.y2) > lp2.len) ++ while(abs(ex - lp.x2) + abs(ey - lp.y2) > 1 + lp2.len) + { + ex = (lp.x2 + ex) >> 1; + ey = (lp.y2 + ey) >> 1; +diff --git a/include/agg_renderer_outline_image.h b/include/agg_renderer_outline_image.h +index fbfac10..66d2b9a 100644 +--- a/include/agg_renderer_outline_image.h ++++ b/include/agg_renderer_outline_image.h +@@ -969,7 +969,7 @@ namespace agg + } + else + { +- while(abs(sx - lp.x1) + abs(sy - lp.y1) > lp2.len) ++ while(abs(sx - lp.x1) + abs(sy - lp.y1) > 1 + lp2.len) + { + sx = (lp.x1 + sx) >> 1; + sy = (lp.y1 + sy) >> 1; +@@ -982,7 +982,7 @@ namespace agg + } + else + { +- while(abs(ex - lp.x2) + abs(ey - lp.y2) > lp2.len) ++ while(abs(ex - lp.x2) + abs(ey - lp.y2) > 1 + lp2.len) + { + ex = (lp.x2 + ex) >> 1; + ey = (lp.y2 + ey) >> 1; +-- +1.8.1.4 + diff --git a/SDK/package/agg/0001-autogen.patch b/SDK/package/agg/0001-autogen.patch new file mode 100644 index 0000000..b773f12 --- /dev/null +++ b/SDK/package/agg/0001-autogen.patch @@ -0,0 +1,15 @@ +Author: Andrea Veri +Description: Disable configure's run from the autogen file. 
+ +Index: agg-2.5+dfsg1/autogen.sh +=================================================================== +--- a/autogen.sh 2007-10-11 00:06:16.000000000 +0200 ++++ b/autogen.sh 2012-05-01 16:57:37.916862783 +0200 +@@ -18,6 +18,6 @@ + automake --foreign --add-missing --ignore-deps + + # and finally invoke our new configure +-./configure $* ++[ -n "$NOCONFIGURE" ] || ./configure $* + + # end diff --git a/SDK/package/agg/0002-Cure-recursion-by-aborting-if-the-co-ordinates-are-t.patch b/SDK/package/agg/0002-Cure-recursion-by-aborting-if-the-co-ordinates-are-t.patch new file mode 100644 index 0000000..4fe7434 --- /dev/null +++ b/SDK/package/agg/0002-Cure-recursion-by-aborting-if-the-co-ordinates-are-t.patch @@ -0,0 +1,40 @@ +From e269fe9b62af6fe314cebe0ee7a6d6d1a4a84d1c Mon Sep 17 00:00:00 2001 +From: Tom Hughes +Date: Sun, 19 May 2013 11:03:26 +0100 +Subject: [PATCH 02/15] Cure recursion by aborting if the co-ordinates are to + big to handle + +--- + include/agg_rasterizer_cells_aa.h | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +diff --git a/include/agg_rasterizer_cells_aa.h b/include/agg_rasterizer_cells_aa.h +index d3bb138..3a616d9 100644 +--- a/include/agg_rasterizer_cells_aa.h ++++ b/include/agg_rasterizer_cells_aa.h +@@ -40,7 +40,8 @@ + #define AGG_RASTERIZER_CELLS_AA_INCLUDED + + #include +-#include ++#include ++#include + #include "agg_math.h" + #include "agg_array.h" + +@@ -333,6 +334,12 @@ namespace agg + { + int cx = (x1 + x2) >> 1; + int cy = (y1 + y2) >> 1; ++ ++ // Bail if values are so large they are likely to wrap ++ if ((std::abs(x1) >= std::numeric_limits::max()/2) || (std::abs(y1) >= std::numeric_limits::max()/2) || ++ (std::abs(x2) >= std::numeric_limits::max()/2) || (std::abs(y2) >= std::numeric_limits::max()/2)) ++ return; ++ + line(x1, y1, cx, cy); + line(cx, cy, x2, y2); + } +-- +1.8.1.4 + diff --git a/SDK/package/agg/0002-caca.patch b/SDK/package/agg/0002-caca.patch new file mode 100644 index 0000000..f98a573 --- /dev/null +++ b/SDK/package/agg/0002-caca.patch @@ -0,0 +1,34 @@ +--- a/configure.in ++++ b/configure.in +@@ -39,7 +39,7 @@ + # used as platform library in examples: + # todo, make the PREFERED_PLATFORM selectable, after the set of possible + # Platforms to link the examples have been evaluated. +-PREFERED_PLATFORM=X11 ++PREFERED_PLATFORM=sdl + case "$host" in + *darwin* ) + OSX_LIBS="-framework Carbon -framework QuickTime" +@@ -120,9 +120,7 @@ + if test "$no_x" = "yes"; then + AC_MSG_WARN([*** X11 not found! 
Omitting X11 layer.]) + fi +-AM_CONDITIONAL(ENABLE_X11,[test x$no_x = x -a xno != x$enable_platform -a x$win32_host != xyes]) +-AC_SUBST(x_includes) +-AC_SUBST(x_libraries) ++AM_CONDITIONAL(ENABLE_X11,0) + dnl ############################################### + + dnl Settung up library version + +--- a/include/agg_renderer_outline_aa.h ++++ b/include/agg_renderer_outline_aa.h +@@ -1375,7 +1375,7 @@ + //--------------------------------------------------------------------- + void profile(const line_profile_aa& prof) { m_profile = &prof; } + const line_profile_aa& profile() const { return *m_profile; } +- line_profile_aa& profile() { return *m_profile; } ++// line_profile_aa& profile() { return *m_profile; } + + //--------------------------------------------------------------------- + int subpixel_width() const { return m_profile->subpixel_width(); } diff --git a/SDK/package/agg/0003-Get-coordinates-from-previous-vertex-if-last-command.patch b/SDK/package/agg/0003-Get-coordinates-from-previous-vertex-if-last-command.patch new file mode 100644 index 0000000..b12684d --- /dev/null +++ b/SDK/package/agg/0003-Get-coordinates-from-previous-vertex-if-last-command.patch @@ -0,0 +1,30 @@ +From 032d5342430f4c5dfbc34a2817d67386a14fd51b Mon Sep 17 00:00:00 2001 +From: Tom Hughes +Date: Sun, 19 May 2013 11:40:49 +0100 +Subject: [PATCH 03/15] Get coordinates from previous vertex if last command is + path_cmd_end_poly + +--- + include/agg_path_storage.h | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/include/agg_path_storage.h b/include/agg_path_storage.h +index 7be7393..8922fc8 100644 +--- a/include/agg_path_storage.h ++++ b/include/agg_path_storage.h +@@ -878,6 +878,12 @@ namespace agg + *x += x2; + *y += y2; + } ++ else if (!is_stop(m_vertices.last_command()) && ++ is_vertex(m_vertices.prev_vertex(&x2, &y2))) ++ { ++ *x += x2; ++ *y += y2; ++ } + } + } + +-- +1.8.1.4 + diff --git a/SDK/package/agg/0004-Make-rasterizer_outline_aa-ignore-close_polygon-when.patch b/SDK/package/agg/0004-Make-rasterizer_outline_aa-ignore-close_polygon-when.patch new file mode 100644 index 0000000..0cecaf7 --- /dev/null +++ b/SDK/package/agg/0004-Make-rasterizer_outline_aa-ignore-close_polygon-when.patch @@ -0,0 +1,138 @@ +From b9c4b1c72b4ad6b24c37f402d3eec39ef393b0eb Mon Sep 17 00:00:00 2001 +From: Tom Hughes +Date: Sun, 19 May 2013 14:17:43 +0100 +Subject: [PATCH 04/15] Make rasterizer_outline_aa ignore close_polygon when + vertex count < 3 + +--- + include/agg_rasterizer_outline_aa.h | 107 ++++++++++++++++++------------------ + 1 file changed, 52 insertions(+), 55 deletions(-) + +diff --git a/include/agg_rasterizer_outline_aa.h b/include/agg_rasterizer_outline_aa.h +index 4d6dd57..24301d5 100644 +--- a/include/agg_rasterizer_outline_aa.h ++++ b/include/agg_rasterizer_outline_aa.h +@@ -333,68 +333,65 @@ namespace agg + int y2; + int lprev; + +- if(close_polygon) ++ if(close_polygon && (m_src_vertices.size() >= 3)) + { +- if(m_src_vertices.size() >= 3) ++ dv.idx = 2; ++ ++ v = &m_src_vertices[m_src_vertices.size() - 1]; ++ x1 = v->x; ++ y1 = v->y; ++ lprev = v->len; ++ ++ v = &m_src_vertices[0]; ++ x2 = v->x; ++ y2 = v->y; ++ dv.lcurr = v->len; ++ line_parameters prev(x1, y1, x2, y2, lprev); ++ ++ v = &m_src_vertices[1]; ++ dv.x1 = v->x; ++ dv.y1 = v->y; ++ dv.lnext = v->len; ++ dv.curr = line_parameters(x2, y2, dv.x1, dv.y1, dv.lcurr); ++ ++ v = &m_src_vertices[dv.idx]; ++ dv.x2 = v->x; ++ dv.y2 = v->y; ++ dv.next = line_parameters(dv.x1, dv.y1, dv.x2, dv.y2, dv.lnext); ++ ++ dv.xb1 = 0; ++ dv.yb1 = 0; ++ dv.xb2 
= 0; ++ dv.yb2 = 0; ++ ++ switch(m_line_join) + { +- dv.idx = 2; +- +- v = &m_src_vertices[m_src_vertices.size() - 1]; +- x1 = v->x; +- y1 = v->y; +- lprev = v->len; +- +- v = &m_src_vertices[0]; +- x2 = v->x; +- y2 = v->y; +- dv.lcurr = v->len; +- line_parameters prev(x1, y1, x2, y2, lprev); +- +- v = &m_src_vertices[1]; +- dv.x1 = v->x; +- dv.y1 = v->y; +- dv.lnext = v->len; +- dv.curr = line_parameters(x2, y2, dv.x1, dv.y1, dv.lcurr); +- +- v = &m_src_vertices[dv.idx]; +- dv.x2 = v->x; +- dv.y2 = v->y; +- dv.next = line_parameters(dv.x1, dv.y1, dv.x2, dv.y2, dv.lnext); +- +- dv.xb1 = 0; +- dv.yb1 = 0; +- dv.xb2 = 0; +- dv.yb2 = 0; +- +- switch(m_line_join) +- { +- case outline_no_join: +- dv.flags = 3; +- break; ++ case outline_no_join: ++ dv.flags = 3; ++ break; + +- case outline_miter_join: +- case outline_round_join: +- dv.flags = +- (prev.diagonal_quadrant() == dv.curr.diagonal_quadrant()) | +- ((dv.curr.diagonal_quadrant() == dv.next.diagonal_quadrant()) << 1); +- break; ++ case outline_miter_join: ++ case outline_round_join: ++ dv.flags = ++ (prev.diagonal_quadrant() == dv.curr.diagonal_quadrant()) | ++ ((dv.curr.diagonal_quadrant() == dv.next.diagonal_quadrant()) << 1); ++ break; + +- case outline_miter_accurate_join: +- dv.flags = 0; +- break; +- } ++ case outline_miter_accurate_join: ++ dv.flags = 0; ++ break; ++ } + +- if((dv.flags & 1) == 0 && m_line_join != outline_round_join) +- { +- bisectrix(prev, dv.curr, &dv.xb1, &dv.yb1); +- } ++ if((dv.flags & 1) == 0 && m_line_join != outline_round_join) ++ { ++ bisectrix(prev, dv.curr, &dv.xb1, &dv.yb1); ++ } + +- if((dv.flags & 2) == 0 && m_line_join != outline_round_join) +- { +- bisectrix(dv.curr, dv.next, &dv.xb2, &dv.yb2); +- } +- draw(dv, 0, m_src_vertices.size()); ++ if((dv.flags & 2) == 0 && m_line_join != outline_round_join) ++ { ++ bisectrix(dv.curr, dv.next, &dv.xb2, &dv.yb2); + } ++ draw(dv, 0, m_src_vertices.size()); + } + else + { +-- +1.8.1.4 + diff --git a/SDK/package/agg/0005-Remove-VC-6-workaround.patch b/SDK/package/agg/0005-Remove-VC-6-workaround.patch new file mode 100644 index 0000000..f38f7c4 --- /dev/null +++ b/SDK/package/agg/0005-Remove-VC-6-workaround.patch @@ -0,0 +1,52 @@ +From b8c43fb0ba13af0cc2b1050f48f81d76d2fdf0c7 Mon Sep 17 00:00:00 2001 +From: Tom Hughes +Date: Sun, 19 May 2013 15:04:05 +0100 +Subject: [PATCH 05/15] Remove VC++ 6 workaround + +--- + include/agg_renderer_scanline.h | 29 +---------------------------- + 1 file changed, 1 insertion(+), 28 deletions(-) + +diff --git a/include/agg_renderer_scanline.h b/include/agg_renderer_scanline.h +index c3bb6f0..c27ca60 100644 +--- a/include/agg_renderer_scanline.h ++++ b/include/agg_renderer_scanline.h +@@ -79,34 +79,7 @@ namespace agg + sl.reset(ras.min_x(), ras.max_x()); + while(ras.sweep_scanline(sl)) + { +- //render_scanline_aa_solid(sl, ren, ren_color); +- +- // This code is equivalent to the above call (copy/paste). 
+- // It's just a "manual" optimization for old compilers, +- // like Microsoft Visual C++ v6.0 +- //------------------------------- +- int y = sl.y(); +- unsigned num_spans = sl.num_spans(); +- typename Scanline::const_iterator span = sl.begin(); +- +- for(;;) +- { +- int x = span->x; +- if(span->len > 0) +- { +- ren.blend_solid_hspan(x, y, (unsigned)span->len, +- ren_color, +- span->covers); +- } +- else +- { +- ren.blend_hline(x, y, (unsigned)(x - span->len - 1), +- ren_color, +- *(span->covers)); +- } +- if(--num_spans == 0) break; +- ++span; +- } ++ render_scanline_aa_solid(sl, ren, ren_color); + } + } + } +-- +1.8.1.4 + diff --git a/SDK/package/agg/0006-Implement-grain-merge-blending-mode-GIMP.patch b/SDK/package/agg/0006-Implement-grain-merge-blending-mode-GIMP.patch new file mode 100644 index 0000000..f1e465b --- /dev/null +++ b/SDK/package/agg/0006-Implement-grain-merge-blending-mode-GIMP.patch @@ -0,0 +1,85 @@ +From 9422570f4e099a834fc43619f7b2a7eb6b442e25 Mon Sep 17 00:00:00 2001 +From: Tom Hughes +Date: Sun, 19 May 2013 15:31:01 +0100 +Subject: [PATCH 06/15] Implement grain-merge blending mode (GIMP) + +--- + include/agg_pixfmt_rgba.h | 42 ++++++++++++++++++++++++++++++++++++++++-- + 1 file changed, 40 insertions(+), 2 deletions(-) + +diff --git a/include/agg_pixfmt_rgba.h b/include/agg_pixfmt_rgba.h +index 79d10dc..f576ce4 100644 +--- a/include/agg_pixfmt_rgba.h ++++ b/include/agg_pixfmt_rgba.h +@@ -1401,9 +1401,46 @@ namespace agg + } + }; + ++ //================================================comp_op_rgba_grain_merge ++ template struct comp_op_rgba_grain_merge ++ { ++ typedef ColorT color_type; ++ typedef Order order_type; ++ typedef typename color_type::value_type value_type; ++ typedef typename color_type::calc_type calc_type; ++ typedef typename color_type::long_type long_type; ++ enum base_scale_e ++ { ++ base_shift = color_type::base_shift, ++ base_mask = color_type::base_mask ++ }; + ++ // E = I + M - 128 ++ static AGG_INLINE void blend_pix(value_type* p, ++ unsigned sr, unsigned sg, unsigned sb, ++ unsigned sa, unsigned cover) ++ { + +- ++ if(cover < 255) ++ { ++ sr = (sr * cover + 255) >> 8; ++ sg = (sg * cover + 255) >> 8; ++ sb = (sb * cover + 255) >> 8; ++ sa = (sa * cover + 255) >> 8; ++ } ++ if(sa) ++ { ++ calc_type da = p[Order::A]; ++ int dr = sr + p[Order::R] - 128; ++ int dg = sg + p[Order::G] - 128; ++ int db = sb + p[Order::B] - 128; ++ p[Order::R] = (value_type)(dr < 0 ? 0 : (dr > 255 ? 255 : dr)); ++ p[Order::G] = (value_type)(dg < 0 ? 0 : (dg > 255 ? 255 : dg)); ++ p[Order::B] = (value_type)(db < 0 ? 0 : (db > 255 ? 
255 : db)); ++ p[Order::A] = (value_type)(sa + da - ((sa * da + base_mask) >> base_shift)); ++ } ++ } ++ }; + + //======================================================comp_op_table_rgba + template struct comp_op_table_rgba +@@ -1451,6 +1488,7 @@ namespace agg + comp_op_rgba_contrast ::blend_pix, + comp_op_rgba_invert ::blend_pix, + comp_op_rgba_invert_rgb ::blend_pix, ++ comp_op_rgba_grain_merge::blend_pix, + 0 + }; + +@@ -1486,6 +1524,7 @@ namespace agg + comp_op_contrast, //----comp_op_contrast + comp_op_invert, //----comp_op_invert + comp_op_invert_rgb, //----comp_op_invert_rgb ++ comp_op_grain_merge, //----comp_op_grain_merge + + end_of_comp_op_e + }; +@@ -2908,4 +2947,3 @@ namespace agg + } + + #endif +- +-- +1.8.1.4 + diff --git a/SDK/package/agg/0007-Implement-grain-extract-blending-mode-GIMP.patch b/SDK/package/agg/0007-Implement-grain-extract-blending-mode-GIMP.patch new file mode 100644 index 0000000..cafb36e --- /dev/null +++ b/SDK/package/agg/0007-Implement-grain-extract-blending-mode-GIMP.patch @@ -0,0 +1,85 @@ +From abd440342e166a90d08610bf5b31d2a8357eafbe Mon Sep 17 00:00:00 2001 +From: Tom Hughes +Date: Sun, 19 May 2013 15:43:18 +0100 +Subject: [PATCH 07/15] Implement grain-extract blending mode (GIMP) + +--- + include/agg_pixfmt_rgba.h | 48 +++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 48 insertions(+) + +diff --git a/include/agg_pixfmt_rgba.h b/include/agg_pixfmt_rgba.h +index f576ce4..42f0a05 100644 +--- a/include/agg_pixfmt_rgba.h ++++ b/include/agg_pixfmt_rgba.h +@@ -1442,6 +1442,52 @@ namespace agg + } + }; + ++ //==============================================comp_op_rgba_grain_extract ++ template struct comp_op_rgba_grain_extract ++ { ++ typedef ColorT color_type; ++ typedef Order order_type; ++ typedef typename color_type::value_type value_type; ++ typedef typename color_type::calc_type calc_type; ++ typedef typename color_type::long_type long_type; ++ enum base_scale_e ++ { ++ base_shift = color_type::base_shift, ++ base_mask = color_type::base_mask ++ }; ++ ++ // E = I - M + 128 ++ static AGG_INLINE void blend_pix(value_type* p, ++ unsigned sr, unsigned sg, unsigned sb, ++ unsigned sa, unsigned cover) ++ { ++ calc_type da = (p[Order::A] * sa + 255) >> 8; ++ ++ int dr = p[Order::R] - sr + 128; ++ int dg = p[Order::G] - sg + 128; ++ int db = p[Order::B] - sb + 128; ++ ++ dr = dr < 0 ? 0 : (dr > 255 ? 255 : dr); ++ dg = dg < 0 ? 0 : (dg > 255 ? 255 : dg); ++ db = db < 0 ? 0 : (db > 255 ? 
255 : db); ++ ++ p[Order::A] = da; ++ ++ if(da < 255) ++ { ++ p[Order::R] = (dr * da + 255) >> 8; ++ p[Order::G] = (dg * da + 255) >> 8; ++ p[Order::B] = (db * da + 255) >> 8; ++ } ++ else ++ { ++ p[Order::R] = dr; ++ p[Order::G] = dg; ++ p[Order::B] = db; ++ } ++ } ++ }; ++ + //======================================================comp_op_table_rgba + template struct comp_op_table_rgba + { +@@ -1489,6 +1535,7 @@ namespace agg + comp_op_rgba_invert ::blend_pix, + comp_op_rgba_invert_rgb ::blend_pix, + comp_op_rgba_grain_merge::blend_pix, ++ comp_op_rgba_grain_extract::blend_pix, + 0 + }; + +@@ -1525,6 +1572,7 @@ namespace agg + comp_op_invert, //----comp_op_invert + comp_op_invert_rgb, //----comp_op_invert_rgb + comp_op_grain_merge, //----comp_op_grain_merge ++ comp_op_grain_extract, //----comp_op_grain_extract + + end_of_comp_op_e + }; +-- +1.8.1.4 + diff --git a/SDK/package/agg/0008-Declare-multiplication-and-division-operators-as-con.patch b/SDK/package/agg/0008-Declare-multiplication-and-division-operators-as-con.patch new file mode 100644 index 0000000..0ed92ee --- /dev/null +++ b/SDK/package/agg/0008-Declare-multiplication-and-division-operators-as-con.patch @@ -0,0 +1,36 @@ +From 2688af280836b95908d3cfd6915510d55de673b8 Mon Sep 17 00:00:00 2001 +From: Tom Hughes +Date: Sun, 19 May 2013 16:15:01 +0100 +Subject: [PATCH 08/15] Declare multiplication and division operators as const + +--- + include/agg_trans_affine.h | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/include/agg_trans_affine.h b/include/agg_trans_affine.h +index a662099..2f602a0 100644 +--- a/include/agg_trans_affine.h ++++ b/include/agg_trans_affine.h +@@ -216,15 +216,15 @@ namespace agg + } + + // Multiply the matrix by another one and return +- // the result in a separete matrix. +- trans_affine operator * (const trans_affine& m) ++ // the result in a separate matrix. ++ trans_affine operator * (const trans_affine& m) const + { + return trans_affine(*this).multiply(m); + } + + // Multiply the matrix by inverse of another one +- // and return the result in a separete matrix. +- trans_affine operator / (const trans_affine& m) ++ // and return the result in a separate matrix. 
++ trans_affine operator / (const trans_affine& m) const + { + return trans_affine(*this).multiply_inv(m); + } +-- +1.8.1.4 + diff --git a/SDK/package/agg/0009-Add-a-static-identity-transformation.patch b/SDK/package/agg/0009-Add-a-static-identity-transformation.patch new file mode 100644 index 0000000..01555cb --- /dev/null +++ b/SDK/package/agg/0009-Add-a-static-identity-transformation.patch @@ -0,0 +1,37 @@ +From be9ed90897bc43b4547a3a1f8046827caaf13b4c Mon Sep 17 00:00:00 2001 +From: Tom Hughes +Date: Sun, 19 May 2013 16:15:36 +0100 +Subject: [PATCH 09/15] Add a static identity transformation + +--- + include/agg_trans_affine.h | 1 + + src/agg_trans_affine.cpp | 1 + + 2 files changed, 2 insertions(+) + +diff --git a/include/agg_trans_affine.h b/include/agg_trans_affine.h +index 2f602a0..67fe5ca 100644 +--- a/include/agg_trans_affine.h ++++ b/include/agg_trans_affine.h +@@ -92,6 +92,7 @@ namespace agg + //---------------------------------------------------------------------- + struct trans_affine + { ++ static const trans_affine identity; + double sx, shy, shx, sy, tx, ty; + + //------------------------------------------ Construction +diff --git a/src/agg_trans_affine.cpp b/src/agg_trans_affine.cpp +index aca18c2..b3d9bc0 100644 +--- a/src/agg_trans_affine.cpp ++++ b/src/agg_trans_affine.cpp +@@ -28,6 +28,7 @@ + + namespace agg + { ++ const trans_affine trans_affine::identity; + + //------------------------------------------------------------------------ + const trans_affine& trans_affine::parl_to_parl(const double* src, +-- +1.8.1.4 + diff --git a/SDK/package/agg/0010-Add-renderer_scanline_aa_alpha.patch b/SDK/package/agg/0010-Add-renderer_scanline_aa_alpha.patch new file mode 100644 index 0000000..b0be258 --- /dev/null +++ b/SDK/package/agg/0010-Add-renderer_scanline_aa_alpha.patch @@ -0,0 +1,193 @@ +From 749c8cd11e9e6f81e93ae5ce19258431722b6bdf Mon Sep 17 00:00:00 2001 +From: Tom Hughes +Date: Sun, 19 May 2013 16:43:25 +0100 +Subject: [PATCH 10/15] Add renderer_scanline_aa_alpha + +--- + include/agg_pixfmt_rgba.h | 24 +++++++++++++- + include/agg_renderer_base.h | 28 ++++++++++++++++ + include/agg_renderer_scanline.h | 71 +++++++++++++++++++++++++++++++++++++++++ + 3 files changed, 122 insertions(+), 1 deletion(-) + +diff --git a/include/agg_pixfmt_rgba.h b/include/agg_pixfmt_rgba.h +index 42f0a05..6c4bc37 100644 +--- a/include/agg_pixfmt_rgba.h ++++ b/include/agg_pixfmt_rgba.h +@@ -2247,7 +2247,6 @@ namespace agg + } + + +- + //-------------------------------------------------------------------- + void blend_color_vspan(int x, int y, + unsigned len, +@@ -2751,6 +2750,29 @@ namespace agg + } + + //-------------------------------------------------------------------- ++ void blend_color_hspan_alpha(int x, int y, unsigned len, ++ const color_type* colors, ++ value_type alpha, ++ const int8u* covers, ++ int8u cover) ++ { ++ value_type* p = (value_type*)m_rbuf->row_ptr(x, y, len) + (x << 2); ++ do ++ { ++ blender_type::blend_pix(m_comp_op, ++ p, ++ (colors->r * alpha + 255) >> 8, ++ (colors->g * alpha + 255) >> 8, ++ (colors->b * alpha + 255) >> 8, ++ (colors->a * alpha + 255) >> 8, ++ covers ? 
*covers++ : cover); ++ p += 4; ++ ++colors; ++ } ++ while(--len); ++ } ++ ++ //-------------------------------------------------------------------- + void blend_color_vspan(int x, int y, unsigned len, + const color_type* colors, + const int8u* covers, +diff --git a/include/agg_renderer_base.h b/include/agg_renderer_base.h +index 1808944..25f07c3 100644 +--- a/include/agg_renderer_base.h ++++ b/include/agg_renderer_base.h +@@ -37,6 +37,7 @@ namespace agg + public: + typedef PixelFormat pixfmt_type; + typedef typename pixfmt_type::color_type color_type; ++ typedef typename pixfmt_type::color_type::value_type value_type; + typedef typename pixfmt_type::row_data row_data; + + //-------------------------------------------------------------------- +@@ -383,6 +384,33 @@ namespace agg + } + + //-------------------------------------------------------------------- ++ void blend_color_hspan_alpha(int x, int y, int len, ++ const color_type* colors, ++ value_type alpha, ++ const cover_type* covers, ++ cover_type cover = agg::cover_full) ++ { ++ if(y > ymax()) return; ++ if(y < ymin()) return; ++ ++ if(x < xmin()) ++ { ++ int d = xmin() - x; ++ len -= d; ++ if(len <= 0) return; ++ if(covers) covers += d; ++ colors += d; ++ x = xmin(); ++ } ++ if(x + len > xmax()) ++ { ++ len = xmax() - x + 1; ++ if(len <= 0) return; ++ } ++ m_ren->blend_color_hspan_alpha(x, y, len, colors, alpha, covers, cover); ++ } ++ ++ //-------------------------------------------------------------------- + void blend_color_vspan(int x, int y, int len, + const color_type* colors, + const cover_type* covers, +diff --git a/include/agg_renderer_scanline.h b/include/agg_renderer_scanline.h +index c27ca60..4fcb557 100644 +--- a/include/agg_renderer_scanline.h ++++ b/include/agg_renderer_scanline.h +@@ -156,6 +156,35 @@ namespace agg + } + } + ++ //================================================render_scanline_aa_alpha ++ template ++ void render_scanline_aa_alpha(const Scanline& sl, BaseRenderer& ren, ++ SpanAllocator& alloc, SpanGenerator& span_gen, ++ unsigned alpha) ++ { ++ int y = sl.y(); ++ ++ unsigned num_spans = sl.num_spans(); ++ typename Scanline::const_iterator span = sl.begin(); ++ for(;;) ++ { ++ int x = span->x; ++ int len = span->len; ++ const typename Scanline::cover_type* covers = span->covers; ++ ++ if(len < 0) len = -len; ++ typename BaseRenderer::color_type* colors = alloc.allocate(len); ++ span_gen.generate(colors, x, y, len); ++ ren.blend_color_hspan_alpha(x, y, len, colors, alpha, ++ (span->len < 0) ? 
0 : covers, *covers); ++ ++ if(--num_spans == 0) break; ++ ++span; ++ } ++ } ++ ++ + //=====================================================render_scanlines_aa + template +@@ -216,8 +245,50 @@ namespace agg + }; + + ++ //==============================================renderer_scanline_aa_alpha ++ template ++ class renderer_scanline_aa_alpha ++ { ++ public: ++ typedef BaseRenderer base_ren_type; ++ typedef SpanAllocator alloc_type; ++ typedef SpanGenerator span_gen_type; + ++ //-------------------------------------------------------------------- ++ renderer_scanline_aa_alpha() : m_ren(0), m_alloc(0), m_span_gen(0), m_alpha(1.0) {} ++ renderer_scanline_aa_alpha(base_ren_type& ren, ++ alloc_type& alloc, ++ span_gen_type& span_gen, ++ unsigned alpha) : ++ m_ren(&ren), ++ m_alloc(&alloc), ++ m_span_gen(&span_gen), ++ m_alpha(alpha) ++ {} ++ void attach(base_ren_type& ren, ++ alloc_type& alloc, ++ span_gen_type& span_gen) ++ { ++ m_ren = &ren; ++ m_alloc = &alloc; ++ m_span_gen = &span_gen; ++ } + ++ //-------------------------------------------------------------------- ++ void prepare() { m_span_gen->prepare(); } ++ ++ //-------------------------------------------------------------------- ++ template void render(const Scanline& sl) ++ { ++ render_scanline_aa_alpha(sl, *m_ren, *m_alloc, *m_span_gen, m_alpha); ++ } ++ ++ private: ++ base_ren_type* m_ren; ++ alloc_type* m_alloc; ++ span_gen_type* m_span_gen; ++ unsigned m_alpha; ++ }; + + + //===============================================render_scanline_bin_solid +-- +1.8.1.4 + diff --git a/SDK/package/agg/0011-Avoid-division-by-zero-in-color-burn-mode.patch b/SDK/package/agg/0011-Avoid-division-by-zero-in-color-burn-mode.patch new file mode 100644 index 0000000..2a0d198 --- /dev/null +++ b/SDK/package/agg/0011-Avoid-division-by-zero-in-color-burn-mode.patch @@ -0,0 +1,58 @@ +From 0ec68d7f5695403eccac75025ba7f6f7ecf1814e Mon Sep 17 00:00:00 2001 +From: Tom Hughes +Date: Sun, 19 May 2013 16:49:08 +0100 +Subject: [PATCH 11/15] Avoid division by zero in color-burn mode + +FIXME: re-work using latest math from http://www.w3.org/TR/SVGCompositing/ +--- + include/agg_pixfmt_rgba.h | 21 ++++++++++++++++++--- + 1 file changed, 18 insertions(+), 3 deletions(-) + +diff --git a/include/agg_pixfmt_rgba.h b/include/agg_pixfmt_rgba.h +index 6c4bc37..5d6b511 100644 +--- a/include/agg_pixfmt_rgba.h ++++ b/include/agg_pixfmt_rgba.h +@@ -1027,6 +1027,21 @@ namespace agg + // Dca' = Sa.(Sca.Da + Dca.Sa - Sa.Da)/Sca + Sca.(1 - Da) + Dca.(1 - Sa) + // + // Da' = Sa + Da - Sa.Da ++ ++ ++ // http://www.w3.org/TR/SVGCompositing/ ++ // if Sca == 0 and Dca == Da ++ // Dca' = Sa × Da + Sca × (1 - Da) + Dca × (1 - Sa) ++ // = Sa × Da + Dca × (1 - Sa) ++ // = Da = Dca ++ // otherwise if Sca == 0 ++ // Dca' = Sca × (1 - Da) + Dca × (1 - Sa) ++ // = Dca × (1 - Sa) ++ // otherwise if Sca > 0 ++ // Dca' = Sa × Da - Sa × Da × min(1, (1 - Dca/Da) × Sa/Sca) + Sca × (1 - Da) + Dca × (1 - Sa) ++ // = Sa × Da × (1 - min(1, (1 - Dca/Da) × Sa/Sca)) + Sca × (1 - Da) + Dca × (1 - Sa) ++ ++ // sa * da * (255 - std::min(255, (255 - p[0]/da)*(sa/(sc*sa)) + + static AGG_INLINE void blend_pix(value_type* p, + unsigned sr, unsigned sg, unsigned sb, + unsigned sa, unsigned cover) +@@ -1056,15 +1071,15 @@ namespace agg + + p[Order::R] = (value_type)(((srda + drsa <= sada) ? + sr * d1a + dr * s1a : +- sa * (srda + drsa - sada) / sr + sr * d1a + dr * s1a + base_mask) >> base_shift); ++ (sr > 0 ? 
sa * (srda + drsa - sada) / sr + sr * d1a + dr * s1a + base_mask : 0)) >> base_shift); + + p[Order::G] = (value_type)(((sgda + dgsa <= sada) ? + sg * d1a + dg * s1a : +- sa * (sgda + dgsa - sada) / sg + sg * d1a + dg * s1a + base_mask) >> base_shift); ++ (sg > 0 ? sa * (sgda + dgsa - sada) / sg + sg * d1a + dg * s1a + base_mask : 0)) >> base_shift); + + p[Order::B] = (value_type)(((sbda + dbsa <= sada) ? + sb * d1a + db * s1a : +- sa * (sbda + dbsa - sada) / sb + sb * d1a + db * s1a + base_mask) >> base_shift); ++ (sb > 0 ? sa * (sbda + dbsa - sada) / sb + sb * d1a + db * s1a + base_mask : 0)) >> base_shift); + + p[Order::A] = (value_type)(sa + da - ((sa * da + base_mask) >> base_shift)); + } +-- +1.8.1.4 + diff --git a/SDK/package/agg/0012-Avoid-pixel-artifacts-when-compositing.patch b/SDK/package/agg/0012-Avoid-pixel-artifacts-when-compositing.patch new file mode 100644 index 0000000..b3e641e --- /dev/null +++ b/SDK/package/agg/0012-Avoid-pixel-artifacts-when-compositing.patch @@ -0,0 +1,26 @@ +From bf0e0b71360cfbc690a29f4abe15d7b9b61b8479 Mon Sep 17 00:00:00 2001 +From: Tom Hughes +Date: Sat, 22 Jun 2013 12:11:54 +0100 +Subject: [PATCH 12/15] Avoid pixel artifacts when compositing + +Change src_over alpha to avoid pixel artifacts by reordering computations. +--- + include/agg_pixfmt_rgba.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/include/agg_pixfmt_rgba.h b/include/agg_pixfmt_rgba.h +index 5d6b511..bb255cd 100644 +--- a/include/agg_pixfmt_rgba.h ++++ b/include/agg_pixfmt_rgba.h +@@ -346,7 +346,7 @@ namespace agg + p[Order::R] = (value_type)(sr + ((p[Order::R] * s1a + base_mask) >> base_shift)); + p[Order::G] = (value_type)(sg + ((p[Order::G] * s1a + base_mask) >> base_shift)); + p[Order::B] = (value_type)(sb + ((p[Order::B] * s1a + base_mask) >> base_shift)); +- p[Order::A] = (value_type)(sa + p[Order::A] - ((sa * p[Order::A] + base_mask) >> base_shift)); ++ p[Order::A] = (value_type)(sa + ((p[Order::A] * s1a + base_mask) >> base_shift)); + } + }; + +-- +1.8.1.4 + diff --git a/SDK/package/agg/0013-Modify-agg-conv-classes-to-allow-access-to-the-origi.patch b/SDK/package/agg/0013-Modify-agg-conv-classes-to-allow-access-to-the-origi.patch new file mode 100644 index 0000000..9deb904 --- /dev/null +++ b/SDK/package/agg/0013-Modify-agg-conv-classes-to-allow-access-to-the-origi.patch @@ -0,0 +1,93 @@ +From 6f1ab5f4b470bcf4e7e72aac6e2f7f6ee3e7b424 Mon Sep 17 00:00:00 2001 +From: Tom Hughes +Date: Sat, 22 Jun 2013 12:16:42 +0100 +Subject: [PATCH 13/15] Modify agg conv classes to allow access to the original + geometry type + +--- + include/agg_conv_adaptor_vcgen.h | 2 ++ + include/agg_conv_adaptor_vpgen.h | 1 + + include/agg_conv_clip_polygon.h | 1 + + include/agg_conv_clip_polyline.h | 1 + + include/agg_conv_smooth_poly1.h | 2 ++ + 5 files changed, 7 insertions(+) + +diff --git a/include/agg_conv_adaptor_vcgen.h b/include/agg_conv_adaptor_vcgen.h +index 7bd9b07..fef4579 100644 +--- a/include/agg_conv_adaptor_vcgen.h ++++ b/include/agg_conv_adaptor_vcgen.h +@@ -38,6 +38,7 @@ namespace agg + + void rewind(unsigned) {} + unsigned vertex(double*, double*) { return path_cmd_stop; } ++ unsigned type() const { return 0; } + }; + + +@@ -73,6 +74,7 @@ namespace agg + } + + unsigned vertex(double* x, double* y); ++ unsigned type() const { return m_source->type(); } + + private: + // Prohibit copying +diff --git a/include/agg_conv_adaptor_vpgen.h b/include/agg_conv_adaptor_vpgen.h +index dca9415..a39102d 100644 +--- a/include/agg_conv_adaptor_vpgen.h ++++ 
b/include/agg_conv_adaptor_vpgen.h +@@ -42,6 +42,7 @@ namespace agg + + void rewind(unsigned path_id); + unsigned vertex(double* x, double* y); ++ unsigned type() const { return m_source->type(); } + + private: + conv_adaptor_vpgen(const conv_adaptor_vpgen&); +diff --git a/include/agg_conv_clip_polygon.h b/include/agg_conv_clip_polygon.h +index 3c34590..e417a7d 100644 +--- a/include/agg_conv_clip_polygon.h ++++ b/include/agg_conv_clip_polygon.h +@@ -60,6 +60,7 @@ namespace agg + double y1() const { return base_type::vpgen().y1(); } + double x2() const { return base_type::vpgen().x2(); } + double y2() const { return base_type::vpgen().y2(); } ++ unsigned type() const { return base_type::type(); } + + private: + conv_clip_polygon(const conv_clip_polygon&); +diff --git a/include/agg_conv_clip_polyline.h b/include/agg_conv_clip_polyline.h +index d45067f..0de4b57 100644 +--- a/include/agg_conv_clip_polyline.h ++++ b/include/agg_conv_clip_polyline.h +@@ -60,6 +60,7 @@ namespace agg + double y1() const { return base_type::vpgen().y1(); } + double x2() const { return base_type::vpgen().x2(); } + double y2() const { return base_type::vpgen().y2(); } ++ unsigned type() const { return base_type::type(); } + + private: + conv_clip_polyline(const conv_clip_polyline&); +diff --git a/include/agg_conv_smooth_poly1.h b/include/agg_conv_smooth_poly1.h +index 15f7f8d..0956c4e 100644 +--- a/include/agg_conv_smooth_poly1.h ++++ b/include/agg_conv_smooth_poly1.h +@@ -48,6 +48,7 @@ namespace agg + + void smooth_value(double v) { base_type::generator().smooth_value(v); } + double smooth_value() const { return base_type::generator().smooth_value(); } ++ unsigned type() const { return base_type::type(); } + + private: + conv_smooth_poly1(const conv_smooth_poly1&); +@@ -70,6 +71,7 @@ namespace agg + + void smooth_value(double v) { m_smooth.generator().smooth_value(v); } + double smooth_value() const { return m_smooth.generator().smooth_value(); } ++ unsigned type() const { return m_smooth.type(); } + + private: + conv_smooth_poly1_curve(const conv_smooth_poly1_curve&); +-- +1.8.1.4 + diff --git a/SDK/package/agg/0014-Avoid-potential-zero-division-resulting-in-nan-in-ag.patch b/SDK/package/agg/0014-Avoid-potential-zero-division-resulting-in-nan-in-ag.patch new file mode 100644 index 0000000..547b0d2 --- /dev/null +++ b/SDK/package/agg/0014-Avoid-potential-zero-division-resulting-in-nan-in-ag.patch @@ -0,0 +1,30 @@ +From 6433a64f4cd41e88499386b0b7c7ae05d30683b8 Mon Sep 17 00:00:00 2001 +From: Tom Hughes +Date: Sat, 22 Jun 2013 12:33:32 +0100 +Subject: [PATCH 14/15] Avoid potential zero division resulting in nan in + agg::gamma_linear + +--- + include/agg_gamma_functions.h | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/include/agg_gamma_functions.h b/include/agg_gamma_functions.h +index fa38a45..beb0c04 100644 +--- a/include/agg_gamma_functions.h ++++ b/include/agg_gamma_functions.h +@@ -94,7 +94,11 @@ namespace agg + { + if(x < m_start) return 0.0; + if(x > m_end) return 1.0; +- return (x - m_start) / (m_end - m_start); ++ double delta = m_end - m_start; ++ // avoid nan from potential zero division ++ // https://github.com/mapnik/mapnik/issues/761 ++ if (delta <= 0.0) return 0.0; ++ return (x - m_start) / delta; + } + + private: +-- +1.8.1.4 + diff --git a/SDK/package/agg/0015-Ensure-first-value-in-the-gamma-table-is-always-zero.patch b/SDK/package/agg/0015-Ensure-first-value-in-the-gamma-table-is-always-zero.patch new file mode 100644 index 0000000..6214bd6 --- /dev/null +++ 
b/SDK/package/agg/0015-Ensure-first-value-in-the-gamma-table-is-always-zero.patch @@ -0,0 +1,24 @@ +From ca818d4dcd428c5560fc3c341fbaf427a7485e32 Mon Sep 17 00:00:00 2001 +From: Tom Hughes +Date: Sat, 22 Jun 2013 12:34:37 +0100 +Subject: [PATCH 15/15] Ensure first value in the gamma table is always zero + +--- + include/agg_gamma_functions.h | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/include/agg_gamma_functions.h b/include/agg_gamma_functions.h +index beb0c04..b8eda52 100644 +--- a/include/agg_gamma_functions.h ++++ b/include/agg_gamma_functions.h +@@ -49,6 +49,7 @@ namespace agg + + double operator() (double x) const + { ++ if (x == 0.0) return 0.0; + return pow(x, m_gamma); + } + +-- +1.8.1.4 + diff --git a/SDK/package/agg/CREATE_FILES.patch b/SDK/package/agg/CREATE_FILES.patch new file mode 100644 index 0000000..1a78125 --- /dev/null +++ b/SDK/package/agg/CREATE_FILES.patch @@ -0,0 +1,14 @@ +--- a/README.orig 2007-01-07 13:58:28.000000000 +0000 ++++ b/README 2007-01-07 14:02:40.000000000 +0000 +@@ -0,0 +1 @@ ++cacac + +--- a/NEWS.orig 2007-01-07 13:58:28.000000000 +0000 ++++ b/NEWS 2007-01-07 14:02:40.000000000 +0000 +@@ -0,0 +1 @@ ++cacac + +--- a/AUTHORS.orig 2007-01-07 13:58:28.000000000 +0000 ++++ b/AUTHORS 2007-01-07 14:02:40.000000000 +0000 +@@ -0,0 +1 @@ ++cacac diff --git a/SDK/package/agg/Config.in b/SDK/package/agg/Config.in new file mode 100644 index 0000000..a842098 --- /dev/null +++ b/SDK/package/agg/Config.in @@ -0,0 +1,13 @@ +config BR2_PACKAGE_AGG + bool "agg" + depends on BR2_INSTALL_LIBSTDCPP + select BR2_PACKAGE_SDL + help + The Anti-Grain Geometry project. A High Quality 2D Graphics Rendering + Engine for C++. + We select the SDL backend by default. + + http://www.antigrain.com/index.html + +comment "agg needs a toolchain with C++ support" + depends on !BR2_INSTALL_LIBSTDCPP diff --git a/SDK/package/agg/agg-2.4-depends.patch b/SDK/package/agg/agg-2.4-depends.patch new file mode 100644 index 0000000..f5506e2 --- /dev/null +++ b/SDK/package/agg/agg-2.4-depends.patch @@ -0,0 +1,36 @@ +--- agg-2.4.orig/font_freetype/Makefile.am 2005-10-18 11:45:40.000000000 +0100 ++++ agg-2.4/font_freetype/Makefile.am 2006-07-10 15:11:55.000000000 +0100 +@@ -4,8 +4,9 @@ + agginclude_HEADERS = agg_font_freetype.h + lib_LTLIBRARIES = libaggfontfreetype.la + +-libaggfontfreetype_la_LDFLAGS = -version-info @AGG_LIB_VERSION@ @FREETYPE_LIBS@ ++libaggfontfreetype_la_LDFLAGS = -version-info @AGG_LIB_VERSION@ + libaggfontfreetype_la_SOURCES = agg_font_freetype.cpp + libaggfontfreetype_la_CXXFLAGS = -I$(top_srcdir)/include @FREETYPE_CFLAGS@ ++libaggfontfreetype_la_LIBADD = ../src/libagg.la @FREETYPE_LIBS@ + endif + +--- agg-2.4.orig/src/platform/sdl/Makefile.am 2005-10-17 23:49:35.000000000 +0100 ++++ agg-2.4/src/platform/sdl/Makefile.am 2006-07-10 15:11:55.000000000 +0100 +@@ -5,6 +5,6 @@ + libaggplatformsdl_la_LDFLAGS = -version-info @AGG_LIB_VERSION@ + libaggplatformsdl_la_SOURCES = agg_platform_support.cpp + libaggplatformsdl_la_CXXFLAGS = -I$(top_srcdir)/include @SDL_CFLAGS@ +-libaggplatformsdl_la_LIBADD = @SDL_LIBS@ ++libaggplatformsdl_la_LIBADD = ../../libagg.la @SDL_LIBS@ + endif + +--- agg-2.5.orig/src/platform/X11/Makefile.am 2006-12-11 00:59:45.000000000 +0000 ++++ agg-2.5/src/platform/X11/Makefile.am 2007-01-07 14:07:39.000000000 +0000 +@@ -1,8 +1,8 @@ + if ENABLE_X11 + lib_LTLIBRARIES = libaggplatformX11.la + +-libaggplatformX11_la_LDFLAGS = -version-info @AGG_LIB_VERSION@ -L@x_libraries@ ++libaggplatformX11_la_LDFLAGS = -version-info @AGG_LIB_VERSION@ @X_LDFLAGS@ + 
libaggplatformX11_la_SOURCES = agg_platform_support.cpp + libaggplatformX11_la_CXXFLAGS = -I$(top_srcdir)/include -I@x_includes@ +-libaggplatformX11_la_LIBADD = -lX11 ++libaggplatformX11_la_LIBADD = ../../libagg.la -lX11 + endif diff --git a/SDK/package/agg/agg-2.5-autotools.patch b/SDK/package/agg/agg-2.5-autotools.patch new file mode 100644 index 0000000..1272b65 --- /dev/null +++ b/SDK/package/agg/agg-2.5-autotools.patch @@ -0,0 +1,11 @@ +--- a/configure.in 2013-02-22 09:30:00.000000000 -0600 ++++ b/configure.in 2013-02-22 09:30:49.030777571 -0600 +@@ -8,7 +8,7 @@ + AC_PROG_CC + AC_PROG_CXX + AC_ISC_POSIX +-AM_C_PROTOTYPES ++#AM_C_PROTOTYPES + if test "x$U" != "x"; then + AC_MSG_ERROR(Compiler not ANSI compliant) + fi diff --git a/SDK/package/agg/agg-2.5-pkgconfig.patch b/SDK/package/agg/agg-2.5-pkgconfig.patch new file mode 100644 index 0000000..a303bfb --- /dev/null +++ b/SDK/package/agg/agg-2.5-pkgconfig.patch @@ -0,0 +1,10 @@ +--- agg-2.5/libagg.pc.in.orig 2007-01-07 13:58:28.000000000 +0000 ++++ agg-2.5/libagg.pc.in 2007-01-07 14:02:40.000000000 +0000 +@@ -6,5 +6,6 @@ + Name: libagg + Description: Anti Grain Geometry - A High Quality Rendering Engine for C++ + Version: @VERSION@ +-Libs: -L${libdir} -Wl,-rpath,${exec_prefix}/lib -lagg ++Requires.private: freetype2 ++Libs: -L${libdir} -lagg + Cflags: -I${includedir} diff --git a/SDK/package/agg/agg.mk b/SDK/package/agg/agg.mk new file mode 100644 index 0000000..ecf5749 --- /dev/null +++ b/SDK/package/agg/agg.mk @@ -0,0 +1,32 @@ +############################################################################### +# +# agg +# +############################################################################### + +AGG_VERSION = 2.5 +AGG_SOURCE = agg-$(AGG_VERSION).tar.gz +AGG_SITE = https://ftp.osuosl.org/pub/blfs/8.0/a +AGG_LICENSE = GPLv3+ +AGG_LICENSE_FILES = COPYING +AGG_INSTALL_STAGING = YES +AGG_AUTORECONF = YES + +AGG_DEPENDENCIES = host-pkgconf sdl + +AGG_CONF_OPTS = \ + --with-sdl-prefix=$(STAGING_DIR)/usr \ + --disable-sdltest + +AGG_CONF_OPTS += \ + --with-x=NO \ + --disable-examples --disable-gpc + +ifeq ($(BR2_PACKAGE_FREETYPE),y) +AGG_DEPENDENCIES += freetype +AGG_CONF_OPTS += --enable-freetype +else +AGG_CONF_OPTS += --disable-freetype +endif + +$(eval $(autotools-package)) diff --git a/SDK/package/dmtx-utils/0001-no-static-debug.patch b/SDK/package/dmtx-utils/0001-no-static-debug.patch new file mode 100644 index 0000000..7fbfa01 --- /dev/null +++ b/SDK/package/dmtx-utils/0001-no-static-debug.patch @@ -0,0 +1,35 @@ + dmtxquery/Makefile.am | 2 +- + dmtxread/Makefile.am | 2 +- + dmtxwrite/Makefile.am | 2 +- + 3 files changed, 3 insertions(+), 3 deletions(-) + +diff -Naur a/dmtxquery/Makefile.am b/dmtxquery/Makefile.am +--- a/dmtxquery/Makefile.am ++++ b/dmtxquery/Makefile.am +@@ -9,5 +9,5 @@ + dmtxquery_LDADD = $(LIBOBJS) + + dmtxquery_debug_SOURCES = dmtxquery.c dmtxquery.h ../common/dmtxutil.c ../common/dmtxutil.h +-dmtxquery_debug_LDFLAGS = -static $(DMTX_LIBS) ++dmtxquery_debug_LDFLAGS = $(DMTX_LIBS) + dmtxquery_debug_LDADD = $(LIBOBJS) +diff -Naur a/dmtxread/Makefile.am b/dmtxread/Makefile.am +--- a/dmtxread/Makefile.am ++++ b/dmtxread/Makefile.am +@@ -11,5 +11,5 @@ + + dmtxread_debug_SOURCES = dmtxread.c dmtxread.h ../common/dmtxutil.c ../common/dmtxutil.h + dmtxread_debug_CFLAGS = $(DMTX_CFLAGS) $(MAGICK_CFLAGS) -D_MAGICK_CONFIG_H +-dmtxread_debug_LDFLAGS = -static $(DMTX_LIBS) $(MAGICK_LIBS) ++dmtxread_debug_LDFLAGS = $(DMTX_LIBS) $(MAGICK_LIBS) + dmtxread_debug_LDADD = $(LIBOBJS) +diff -Naur a/dmtxwrite/Makefile.am 
b/dmtxwrite/Makefile.am +--- a/dmtxwrite/Makefile.am ++++ b/dmtxwrite/Makefile.am +@@ -11,5 +11,5 @@ + + dmtxwrite_debug_SOURCES = dmtxwrite.c dmtxwrite.h ../common/dmtxutil.c ../common/dmtxutil.h + dmtxwrite_debug_CFLAGS = $(DMTX_FLAGS) $(MAGICK_CFLAGS) -D_MAGICK_CONFIG_H +-dmtxwrite_debug_LDFLAGS = -static $(DMTX_LIBS) $(MAGICK_LIBS) ++dmtxwrite_debug_LDFLAGS = $(DMTX_LIBS) $(MAGICK_LIBS) + dmtxwrite_debug_LDADD = $(LIBOBJS) diff --git a/SDK/package/dmtx-utils/Config.in b/SDK/package/dmtx-utils/Config.in new file mode 100644 index 0000000..0d22f43 --- /dev/null +++ b/SDK/package/dmtx-utils/Config.in @@ -0,0 +1,12 @@ +config BR2_PACKAGE_DMTX_UTILS + bool "dmtx utils" + depends on BR2_PACKAGE_LIBDMTX + select BR2_PACKAGE_IMAGEMAGICK + help + libdmtx is a software library that enables programs to read + and write Data Matrix barcodes of the modern ECC200 + variety. This package, dmtx-utils, provides command line + utilities that allow scripts and command line users to use + libdmtx functionality. + + https://github.com/dmtx/dmtx-utils diff --git a/SDK/package/dmtx-utils/dmtx-utils.hash b/SDK/package/dmtx-utils/dmtx-utils.hash new file mode 100644 index 0000000..0b9bf1c --- /dev/null +++ b/SDK/package/dmtx-utils/dmtx-utils.hash @@ -0,0 +1,3 @@ +# Locally computed: +sha256 0d396ec14f32a8cf9e08369a4122a16aa2e5fa1675e02218f16f1ab777ea2a28 dmtx-utils-0.7.6.tar.gz +sha256 d8c320ffc0030d1b096ae4732b50d2b811cf95e9a9b7377c1127b2563e0a0388 COPYING diff --git a/SDK/package/dmtx-utils/dmtx-utils.mk b/SDK/package/dmtx-utils/dmtx-utils.mk new file mode 100644 index 0000000..4560b59 --- /dev/null +++ b/SDK/package/dmtx-utils/dmtx-utils.mk @@ -0,0 +1,20 @@ +################################################################################ +# +# dmtx-utils +# +################################################################################ + +DMTX_UTILS_VERSION = 0.7.6 +DMTX_UTILS_SITE = $(call github,dmtx,dmtx-utils,v$(DMTX_UTILS_VERSION)) +DMTX_UTILS_DEPENDENCIES = libdmtx imagemagick +DMTX_UTILS_LICENSE = LGPL-2.1+ +DMTX_UTILS_LICENSE_FILES = COPYING +# github tarball does not include configure +DMTX_UTILS_AUTORECONF = YES + +define DMTX_UTILS_RUN_AUTOGEN + cd $(@D) && PATH=$(BR_PATH) ./autogen.sh +endef +DMTX_UTILS_PRE_CONFIGURE_HOOKS += DMTX_UTILS_RUN_AUTOGEN + +$(eval $(autotools-package)) diff --git a/SDK/package/fluidlite/0001-fluidlite.patch b/SDK/package/fluidlite/0001-fluidlite.patch new file mode 100644 index 0000000..9dc01c4 --- /dev/null +++ b/SDK/package/fluidlite/0001-fluidlite.patch @@ -0,0 +1,11 @@ +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -125,7 +125,7 @@ + endif() + endif() + +-option(FLUIDLITE_BUILD_SHARED "Build shared library" TRUE) ++option(FLUIDLITE_BUILD_SHARED "Build shared library" FALSE) + if(FLUIDLITE_BUILD_SHARED) + add_library(${PROJECT_NAME} SHARED ${SOURCES}) + diff --git a/SDK/package/fluidlite/Config.in b/SDK/package/fluidlite/Config.in new file mode 100644 index 0000000..bdfb7e0 --- /dev/null +++ b/SDK/package/fluidlite/Config.in @@ -0,0 +1,5 @@ +config BR2_PACKAGE_FLUIDLITE + bool "FluidLite" + depends on BR2_USE_MMU + help + FluidLite is a very light version of FluidSynth. 
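Note on the FluidLite package that follows: FluidLite advertises a FluidSynth-compatible C API, so SDK applications can render SoundFont audio without pulling in the full FluidSynth stack. Below is a minimal, illustrative sketch of driving it from target code; it assumes the FluidSynth-style entry points that FluidLite documents (new_fluid_settings, fluid_synth_sfload, fluid_synth_write_s16, ...), and both the SoundFont path and the cross-compiler invocation in the comment are placeholders, not part of this commit's sources.

    /* Hedged sketch: render one second of audio with FluidLite.
     * Assumes FluidLite's FluidSynth-compatible API and header <fluidlite.h>;
     * "font.sf2" is a placeholder SoundFont path.
     * Hypothetical build: arm-funkey-linux-musleabihf-gcc demo.c -lfluidlite -lm
     */
    #include <stdio.h>
    #include <stdlib.h>
    #include <fluidlite.h>

    int main(void)
    {
        fluid_settings_t *settings = new_fluid_settings();
        fluid_synth_t *synth = new_fluid_synth(settings);

        if (fluid_synth_sfload(synth, "font.sf2", 1) == FLUID_FAILED) {
            fprintf(stderr, "could not load SoundFont\n");
            return EXIT_FAILURE;
        }

        fluid_synth_noteon(synth, 0, 60, 100);   /* middle C, velocity 100 */

        /* Render 44100 stereo frames of interleaved signed 16-bit samples;
         * a real player would hand this buffer to SDL or ALSA. */
        static short buf[44100 * 2];
        fluid_synth_write_s16(synth, 44100, buf, 0, 2, buf, 1, 2);

        fluid_synth_noteoff(synth, 0, 60);

        delete_fluid_synth(synth);
        delete_fluid_settings(settings);
        return EXIT_SUCCESS;
    }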
diff --git a/SDK/package/fluidlite/fluidsynth.hash b/SDK/package/fluidlite/fluidsynth.hash
new file mode 100644
index 0000000..d39b0e8
--- /dev/null
+++ b/SDK/package/fluidlite/fluidsynth.hash
@@ -0,0 +1,3 @@
+# Locally computed
+sha256 ef4d008f9fe2fa9a48135505d42dd7e8e9cc4d7494a4b13d6caa13adb5c61ff8 1.0.tar.gz
+sha256 a5564e99fd7f49e1344152a8c5bc1d420d5f973b30e010946764db0b5b9e668c LICENSE
diff --git a/SDK/package/fluidlite/fluidsynth.mk b/SDK/package/fluidlite/fluidsynth.mk
new file mode 100644
index 0000000..5cd24b7
--- /dev/null
+++ b/SDK/package/fluidlite/fluidsynth.mk
@@ -0,0 +1,19 @@
+################################################################################
+#
+# FLUIDLITE
+#
+################################################################################
+
+FLUIDLITE_VERSION = fdd05bad03cdb24d1f78b5fe3453842890c1b0e8
+FLUIDLITE_SITE = $(call github,gcw0,FluidLite,$(FLUIDLITE_VERSION))
+FLUIDLITE_LICENSE = LGPL-2.1+
+FLUIDLITE_LICENSE_FILES = LICENSE
+FLUIDLITE_INSTALL_STAGING = YES
+FLUIDLITE_DEPENDENCIES =
+
+# Disable the shared library for static only build
+ifeq ($(BR2_STATIC_LIBS),y)
+FLUIDLITE_CONF_OPTS += -DFLUIDLITE_BUILD_SHARED=FALSE
+endif
+
+$(eval $(cmake-package))
diff --git a/SDK/package/libini/Config.in b/SDK/package/libini/Config.in
new file mode 100644
index 0000000..a42967b
--- /dev/null
+++ b/SDK/package/libini/Config.in
@@ -0,0 +1,6 @@
+config BR2_PACKAGE_LIBINI
+	bool "libini"
+	help
+	  Tiny library to help reading INI files.
+
+	  https://github.com/pcercuei/libini
diff --git a/SDK/package/libini/libini.mk b/SDK/package/libini/libini.mk
new file mode 100644
index 0000000..7d50b25
--- /dev/null
+++ b/SDK/package/libini/libini.mk
@@ -0,0 +1,13 @@
+#############################################################
+#
+# libini
+#
+#############################################################
+LIBINI_VERSION = c3413da
+LIBINI_SITE_METHOD = git
+LIBINI_SITE = https://github.com/FunKey-Project/libini.git
+LIBINI_LICENSE = LGPL-2.1
+
+LIBINI_INSTALL_STAGING = YES
+
+$(eval $(cmake-package))
diff --git a/SDK/package/libmikmod/Config.in b/SDK/package/libmikmod/Config.in
new file mode 100644
index 0000000..909cc00
--- /dev/null
+++ b/SDK/package/libmikmod/Config.in
@@ -0,0 +1,7 @@
+config BR2_PACKAGE_LIBMIKMOD
+	bool "libmikmod"
+	help
+	  Mikmod is a module player and library supporting many
+	  tracker formats, including mod, s3m, it, and xm.
+ + http://mikmod.shlomifish.org/ diff --git a/SDK/package/libmikmod/libmikmod.mk b/SDK/package/libmikmod/libmikmod.mk new file mode 100644 index 0000000..cce5190 --- /dev/null +++ b/SDK/package/libmikmod/libmikmod.mk @@ -0,0 +1,21 @@ +############################################################# +# +# libmikmod +# +############################################################# +LIBMIKMOD_VERSION:=3.3.11.1 +LIBMIKMOD_SITE:=http://sourceforge.net/projects/mikmod/files/libmikmod/$(LIBMIKMOD_VERSION) + +LIBMIKMOD_CONF_OPTS = --localstatedir=/var + +LIBMIKMOD_LIBTOOL_PATCH = NO +LIBMIKMOD_INSTALL_STAGING = YES + +LIBMIKMOD_CONFIG_SCRIPTS = libmikmod-config + +define LIBMIKMOD_REMOVE_LIBMIKMOD_CONFIG +mv $(TARGET_DIR)/usr/bin/libmikmod-config $(HOST_DIR)/bin/ +endef +LIBMIKMOD_POST_INSTALL_TARGET_HOOKS += LIBMIKMOD_REMOVE_LIBMIKMOD_CONFIG + +$(eval $(autotools-package)) diff --git a/SDK/package/libopk/Config.in b/SDK/package/libopk/Config.in new file mode 100644 index 0000000..db3bce9 --- /dev/null +++ b/SDK/package/libopk/Config.in @@ -0,0 +1,8 @@ +config BR2_PACKAGE_LIBOPK + bool "libopk" + select BR2_PACKAGE_ZLIB + select BR2_PACKAGE_LIBINI + help + Library to handle OPK packages. + + https://github.com/pcercuei/libopk diff --git a/SDK/package/libopk/libopk.mk b/SDK/package/libopk/libopk.mk new file mode 100644 index 0000000..8cf840a --- /dev/null +++ b/SDK/package/libopk/libopk.mk @@ -0,0 +1,14 @@ +############################################################# +# +# libopk +# +############################################################# +LIBOPK_VERSION = 3c918c8 +LIBOPK_SITE_METHOD = git +LIBOPK_SITE = https://github.com/FunKey-Project/libopk.git + +LIBOPK_DEPENDENCIES = libini zlib + +LIBOPK_INSTALL_STAGING = YES + +$(eval $(cmake-package)) diff --git a/SDK/package/libxdgmime/Config.in b/SDK/package/libxdgmime/Config.in new file mode 100644 index 0000000..f1d4fc0 --- /dev/null +++ b/SDK/package/libxdgmime/Config.in @@ -0,0 +1,12 @@ +config BR2_PACKAGE_LIBXDGMIME + bool "libxdgmime" + depends on BR2_USE_WCHAR # shared-mime-info + select BR2_PACKAGE_SHARED_MIME_INFO + help + Simple library that parses the proposed MIME spec + listed at http://freedesktop.org/. 
+
+	  https://github.com/pcercuei/libxdgmime
+
+comment "libxdgmime requires a toolchain with WCHAR support"
+	depends on !BR2_USE_WCHAR
diff --git a/SDK/package/libxdgmime/libxdgmime.mk b/SDK/package/libxdgmime/libxdgmime.mk
new file mode 100644
index 0000000..c6e592f
--- /dev/null
+++ b/SDK/package/libxdgmime/libxdgmime.mk
@@ -0,0 +1,30 @@
+#############################################################
+#
+# libxdgmime
+#
+#############################################################
+LIBXDGMIME_VERSION = db79e7c
+LIBXDGMIME_SITE_METHOD = git
+LIBXDGMIME_SITE = https://github.com/FunKey-Project/libxdgmime.git
+LIBXDGMIME_DEPENDENCIES = shared-mime-info
+LIBXDGMIME_LICENSE = LGPL-2.1+ or AFL-2.1
+
+LIBXDGMIME_INSTALL_STAGING = YES
+
+LIBXDGMIME_MAKE_ENV = CFLAGS="$(TARGET_CFLAGS)" LDFLAGS="$(TARGET_LDFLAGS)" \
+	CROSS_COMPILE="$(TARGET_CROSS)" PREFIX=/usr \
+	PLATFORM="$(BR2_VENDOR)"
+
+define LIBXDGMIME_BUILD_CMDS
+	$(LIBXDGMIME_MAKE_ENV) $(MAKE) -C $(@D)
+endef
+
+define LIBXDGMIME_INSTALL_STAGING_CMDS
+	$(LIBXDGMIME_MAKE_ENV) DESTDIR="$(STAGING_DIR)" $(MAKE) -C $(@D) install
+endef
+
+define LIBXDGMIME_INSTALL_TARGET_CMDS
+	$(LIBXDGMIME_MAKE_ENV) DESTDIR="$(TARGET_DIR)" $(MAKE) -C $(@D) install-lib
+endef
+
+$(eval $(generic-package))
diff --git a/buildroot b/buildroot
index d5bc4aa..d526838 160000
--- a/buildroot
+++ b/buildroot
@@ -1 +1 @@
-Subproject commit d5bc4aaa1e6ab0e2cda437e993a0dcbe745f246b
+Subproject commit d5268384884357d5f17515ee083939f9596a2e73
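Note on libxdgmime: the package gives target code MIME-type detection backed by the shared-mime-info database. The sketch below is illustrative only; it assumes the library keeps the stock freedesktop xdgmime entry points and header name (xdgmime.h, xdg_mime_get_mime_type_for_file, XDG_MIME_TYPE_UNKNOWN), that the shared-mime-info data selected in Config.in is installed on the target, and that the installed library links as -lxdgmime.

    /* Hedged sketch: query the MIME type of a file via xdgmime.
     * Assumes the stock freedesktop xdgmime API packaged by libxdgmime. */
    #include <stdio.h>
    #include <xdgmime.h>

    int main(int argc, char **argv)
    {
        if (argc < 2) {
            fprintf(stderr, "usage: %s <file>\n", argv[0]);
            return 1;
        }

        /* Passing NULL for the stat buffer lets xdgmime stat() the file itself. */
        const char *mime = xdg_mime_get_mime_type_for_file(argv[1], NULL);

        printf("%s: %s\n", argv[1], mime ? mime : XDG_MIME_TYPE_UNKNOWN);
        return 0;
    }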